Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion digital_land/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,16 @@ def pipeline_run(
default_values = pipeline.default_values(endpoints=endpoints)
combine_fields = pipeline.combine_fields(endpoints=endpoints)

lpa_geometry_path = "var/cache/lpa-geometry.csv"
local_authority_path = "var/cache/local-authority.csv"
# load organisations
organisation = Organisation(organisation_path, Path(pipeline.path))
organisation = Organisation(
organisation_path,
Path(pipeline.path),
None,
lpa_geometry_path,
local_authority_path,
)

# load the resource default values from the collection
if not endpoints:
Expand Down
4 changes: 2 additions & 2 deletions digital_land/datatype/point.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@


class PointDataType(WktDataType):
def normalise(self, values, default=["", ""], issues=None):
def normalise(self, values, default=["", ""], issues=None, boundary=None):
if not values or "" in values:
return default

point = f"POINT ({values[0]} {values[1]})"
point = super().normalise(point, issues=issues)
point = super().normalise(point, issues=issues, boundary=boundary)

if not point:
return default
Expand Down
15 changes: 14 additions & 1 deletion digital_land/organisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,27 @@ def lower_uri(value):

class Organisation:
organisation_path = "var/cache/organisation.csv"
lpa_geometry_path = "var/cache/lpa-geometry.csv"
local_authority_path = "var/cache/local-authority.csv"
pipeline_patch_path = "pipeline/patch.csv"
organisation = {}
organisation_uri = {}
organisation_lookup = {}

def __init__(self, organisation_path=None, pipeline_dir=None, organisation=None):
def __init__(
self,
organisation_path=None,
pipeline_dir=None,
organisation=None,
lpa_geometry_path=None,
local_authority_path=None,
):
if organisation_path:
self.organisation_path = organisation_path
if lpa_geometry_path:
self.lpa_geometry_path = lpa_geometry_path
if local_authority_path:
self.local_authority_path = local_authority_path
if pipeline_dir:
self.pipeline_patch_path = pipeline_dir / "patch.csv"
if organisation is None:
Expand Down
36 changes: 35 additions & 1 deletion digital_land/phase/harmonise.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
from datetime import datetime

from .phase import Phase
Expand All @@ -9,9 +10,11 @@ def __init__(
self,
specification=None,
issues=None,
organisation=None,
):
self.specification = specification
self.issues = issues
self.organisation = organisation

def harmonise_field(self, fieldname, value):
if not value:
Expand All @@ -21,6 +24,26 @@ def harmonise_field(self, fieldname, value):
datatype = self.specification.field_type(fieldname)
return datatype.normalise(value, issues=self.issues)

def load_local_planning_authority(self, organisation):
    """Return the local planning authority for a local-authority reference.

    Scans the CSV at ``self.organisation.local_authority_path`` and returns
    the ``local-planning-authority`` value of the first row whose
    ``reference`` column equals *organisation*.

    Args:
        organisation: value to match against the ``reference`` column
            (for example ``"HPL"``).

    Returns:
        The ``local-planning-authority`` value of the first matching row,
        or ``None`` when no row matches.

    Raises:
        KeyError: if the CSV has no ``reference`` or
            ``local-planning-authority`` column.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks
    # are parsed correctly.
    with open(self.organisation.local_authority_path, newline="") as f:
        for row in csv.DictReader(f):
            if row["reference"] == organisation:
                return row["local-planning-authority"]
    return None

def load_lpa_geometry(self, organisation):
    """Return the boundary geometry of an organisation's planning authority.

    The organisation's reference is resolved to a local planning authority
    through ``self.load_local_planning_authority``; that authority is then
    looked up in the CSV at ``self.organisation.lpa_geometry_path``.

    Args:
        organisation: organisation identifier such as
            ``"local-authority-eng:HPL"``; a bare reference with no
            ``prefix:`` part is also accepted.

    Returns:
        The ``geometry`` value of the first row whose ``reference`` equals
        the resolved local planning authority, or ``None`` when
        *organisation* is empty, no planning authority is found, or no
        geometry row matches.

    Raises:
        KeyError: if the geometry CSV has no ``reference`` or ``geometry``
            column.
    """
    if not organisation:
        return None

    # Use everything after the last ":" as the reference rather than
    # assuming it is exactly three characters long, so longer references
    # are not silently truncated.
    reference = organisation.rpartition(":")[2]
    lpa_ref = self.load_local_planning_authority(reference)
    if not lpa_ref:
        return None

    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires.
    with open(self.organisation.lpa_geometry_path, newline="") as f:
        for row in csv.DictReader(f):
            if row["reference"] == lpa_ref:
                return row["geometry"]
    return None

def process(self, stream):
for block in stream:
row = block["row"]
Expand All @@ -43,14 +66,25 @@ def process(self, stream):
self.issues.log_issue(field, "future entry-date", row[field])
o[field] = ""

# if dataset is brownfield land get boundary geometry to use for boundary check
if (
self.organisation
and self.organisation.lpa_geometry_path
and self.organisation.local_authority_path
and block["dataset"] == "brownfield-land"
):
boundary = self.load_lpa_geometry(o["organisation"])
else:
boundary = None

# fix point geometry
# TBD: generalise as a co-constraint
if set(["GeoX", "GeoY"]).issubset(row.keys()):
self.issues.fieldname = "GeoX,GeoY"

point = PointDataType()
(o["GeoX"], o["GeoY"]) = point.normalise(
[o["GeoX"], o["GeoY"]], issues=self.issues
[o["GeoX"], o["GeoY"]], issues=self.issues, boundary=boundary
)

# ensure typology fields are a CURIE
Expand Down
4 changes: 4 additions & 0 deletions tests/data/boundary/local-authority.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dataset,prefix,reference,local-authority-type,wikidata,name,combined-authority,local-authority-district,local-planning-authority,start-date,end-date
local-authority,local-authority,HPL,UA,Q16993720,Hartlepool Borough Council,TVCA,E06000001,E60000003,1905-06-18,
local-authority,local-authority,MDB,UA,Q6841843,Middlesbrough Borough Council,TVCA,E06000002,E60000004,1905-06-18,
local-authority,local-authority,RCC,UA,Q17019415,Redcar and Cleveland Borough Council,TVCA,E06000003,E60000006,1996-04-01,
4 changes: 4 additions & 0 deletions tests/data/boundary/lpa-geometry.csv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions tests/data/boundary/organisation.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
organisation,entity,wikidata,name,website,twitter,statistical-geography,boundary,toid,opendatacommunities,opendatacommunities-area,billing-authority,census-area,local-authority-type,government-organisation,combined-authority,esd-inventories,local-resilience-forum,region,addressbase-custodian,company,wikipedia,start-date,end-date
local-authority-eng:HPL,172,Q16993720,Hartlepool Borough Council,https://www.hartlepool.gov.uk,HpoolCouncil,E06000001,https://raw.githubusercontent.com/digital-land/boundary-collection/master/collection/local-authority/E06000001/index.geojson,,http://opendatacommunities.org/id/unitary-authority/hartlepool,http://opendatacommunities.org/id/geography/administration/ua/E06000001,E0701,00EB,UA,,TVCA,,cleveland,north-east,724,,https://en.wikipedia.org/wiki/Hartlepool_Borough_Council,1905-06-18,
local-authority-eng:MDB,210,Q6841843,Middlesbrough Borough Council,https://www.middlesbrough.gov.uk,MbroCouncil,E06000002,https://raw.githubusercontent.com/digital-land/boundary-collection/master/collection/local-authority/E06000002/index.geojson,,http://opendatacommunities.org/id/unitary-authority/middlesbrough,http://opendatacommunities.org/id/geography/administration/ua/E06000002,E0702,00EC,UA,,TVCA,,cleveland,north-east,734,,https://en.wikipedia.org/wiki/Middlesbrough_Council,1905-06-18,
local-authority-eng:RCC,259,Q17019415,Redcar and Cleveland Borough Council,https://www.redcar-cleveland.gov.uk,RedcarCleveland,E06000003,https://raw.githubusercontent.com/digital-land/boundary-collection/master/collection/local-authority/E06000003/index.geojson,,http://opendatacommunities.org/id/unitary-authority/redcar-and-cleveland,http://opendatacommunities.org/id/geography/administration/ua/E06000003,E0703,00EE,UA,,TVCA,,cleveland,north-east,728,,https://en.wikipedia.org/wiki/Redcar_and_Cleveland_Borough_Council,1996-04-01,
2 changes: 2 additions & 0 deletions tests/data/specification/field.csv
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ entity,,string,1,,,,
reference-entity,,string,1,,,,
field,,string,1,,,,
value,,string,1,,,,
GeoX,,decimal,1,,,,
GeoY,,decimal,1,,,,
2 changes: 1 addition & 1 deletion tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ class FakeDictReader:
# csv.DictReader). Simply returns values from the passed in list of rows.
def __init__(self, rows, resource=None, dataset=None):
self.resource = resource
self.dataset = resource
self.dataset = dataset
self.fieldnames = rows[0].keys()
self.rows = iter(rows)
self.line_number = 0
Expand Down
161 changes: 160 additions & 1 deletion tests/unit/test_harmonise.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env -S py.test -svv

from digital_land.organisation import Organisation
from digital_land.phase.harmonise import HarmonisePhase
from digital_land.specification import Specification
from digital_land.log import IssueLog
Expand All @@ -19,9 +20,12 @@ def test_harmonise_field():

def test_harmonise():
specification = Specification("tests/data/specification")
organisation = Organisation("tests/data/organisation.csv")
issues = IssueLog()

h = HarmonisePhase(specification=specification, issues=issues)
h = HarmonisePhase(
specification=specification, issues=issues, organisation=organisation
)
reader = FakeDictReader(
[
{"field-integer": "123"},
Expand All @@ -34,3 +38,158 @@ def test_harmonise():
assert output[0]["row"] == {"field-integer": "123"}, "pass through valid data"
assert output[1]["row"] == {"field-integer": "321"}, "whitespace trimmed"
assert output[2]["row"] == {"field-integer": ""}, "remove bad data"


def test_harmonise_inside_custom_boundary():
    """GeoX/GeoY are kept for a brownfield-land row attributed to HPL.

    Per the test name, the point is presumably inside the boundary held in
    the fixture lpa-geometry.csv for that organisation.
    """
    organisation = Organisation(
        organisation_path="tests/data/boundary/organisation.csv",
        pipeline_dir=None,
        organisation=None,
        lpa_geometry_path="tests/data/boundary/lpa-geometry.csv",
        local_authority_path="tests/data/boundary/local-authority.csv",
    )
    phase = HarmonisePhase(
        specification=Specification("tests/data/specification"),
        issues=IssueLog(),
        organisation=organisation,
    )

    rows = [
        {
            "organisation": "local-authority-eng:HPL",
            "GeoX": "-1.259419",
            "GeoY": "54.66946",
        },
    ]
    stream = FakeDictReader(rows, resource=None, dataset="brownfield-land")

    output = list(phase.process(stream))

    assert len(output) == 1
    harmonised = output[0]["row"]
    assert harmonised["GeoX"] == "-1.259419"
    assert harmonised["GeoY"] == "54.66946"


def test_harmonise_outside_custom_boundary():
    """GeoX/GeoY are blanked for a brownfield-land row attributed to HPL.

    Per the test name, the point is presumably outside the boundary held in
    the fixture lpa-geometry.csv for that organisation.
    """
    organisation = Organisation(
        organisation_path="tests/data/boundary/organisation.csv",
        pipeline_dir=None,
        organisation=None,
        lpa_geometry_path="tests/data/boundary/lpa-geometry.csv",
        local_authority_path="tests/data/boundary/local-authority.csv",
    )
    phase = HarmonisePhase(
        specification=Specification("tests/data/specification"),
        issues=IssueLog(),
        organisation=organisation,
    )

    rows = [
        {
            "organisation": "local-authority-eng:HPL",
            "GeoX": "-0.446154",
            "GeoY": "53.081202",
        },
    ]
    stream = FakeDictReader(rows, resource=None, dataset="brownfield-land")

    output = list(phase.process(stream))

    assert len(output) == 1
    harmonised = output[0]["row"]
    assert harmonised["GeoX"] == ""
    assert harmonised["GeoY"] == ""


def test_harmonise_non_brownfield_outside_custom_boundary():
    """GeoX/GeoY are kept when the dataset is not brownfield-land.

    The row uses the same organisation and coordinates as the
    outside-custom-boundary case, but with dataset "not-brownfield-land".
    """
    organisation = Organisation(
        organisation_path="tests/data/boundary/organisation.csv",
        pipeline_dir=None,
        organisation=None,
        lpa_geometry_path="tests/data/boundary/lpa-geometry.csv",
        local_authority_path="tests/data/boundary/local-authority.csv",
    )
    phase = HarmonisePhase(
        specification=Specification("tests/data/specification"),
        issues=IssueLog(),
        organisation=organisation,
    )

    rows = [
        {
            "organisation": "local-authority-eng:HPL",
            "GeoX": "-0.446154",
            "GeoY": "53.081202",
        },
    ]
    stream = FakeDictReader(rows, resource=None, dataset="not-brownfield-land")

    output = list(phase.process(stream))

    assert len(output) == 1
    harmonised = output[0]["row"]
    assert harmonised["GeoX"] == "-0.446154"
    assert harmonised["GeoY"] == "53.081202"


def test_harmonise_inside_default_boundary_unknown_organisation():
    """GeoX/GeoY are kept for a brownfield-land row with organisation "???".

    Per the test name, the point is presumably inside whatever default
    boundary applies when the organisation cannot be resolved.
    """
    organisation = Organisation(
        organisation_path="tests/data/boundary/organisation.csv",
        pipeline_dir=None,
        organisation=None,
        lpa_geometry_path="tests/data/boundary/lpa-geometry.csv",
        local_authority_path="tests/data/boundary/local-authority.csv",
    )
    phase = HarmonisePhase(
        specification=Specification("tests/data/specification"),
        issues=IssueLog(),
        organisation=organisation,
    )

    rows = [
        {
            "organisation": "???",
            "GeoX": "-0.446154",
            "GeoY": "53.081202",
        },
    ]
    stream = FakeDictReader(rows, resource=None, dataset="brownfield-land")

    output = list(phase.process(stream))

    assert len(output) == 1
    harmonised = output[0]["row"]
    assert harmonised["GeoX"] == "-0.446154"
    assert harmonised["GeoY"] == "53.081202"


def test_harmonise_outside_default_boundary_unknown_organisation():
    """GeoX/GeoY are blanked for a brownfield-land row with organisation "???".

    Per the test name, the point is presumably outside whatever default
    boundary applies when the organisation cannot be resolved.
    """
    organisation = Organisation(
        organisation_path="tests/data/boundary/organisation.csv",
        pipeline_dir=None,
        organisation=None,
        lpa_geometry_path="tests/data/boundary/lpa-geometry.csv",
        local_authority_path="tests/data/boundary/local-authority.csv",
    )
    phase = HarmonisePhase(
        specification=Specification("tests/data/specification"),
        issues=IssueLog(),
        organisation=organisation,
    )

    rows = [
        {
            "organisation": "???",
            "GeoX": "7.578883",
            "GeoY": "54.379565",
        },
    ]
    stream = FakeDictReader(rows, resource=None, dataset="brownfield-land")

    output = list(phase.process(stream))

    assert len(output) == 1
    harmonised = output[0]["row"]
    assert harmonised["GeoX"] == ""
    assert harmonised["GeoY"] == ""
4 changes: 4 additions & 0 deletions tests/unit/test_specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def test_field_names():
"field",
"reference-entity",
"fact",
"GeoX",
"GeoY",
]
)

Expand Down Expand Up @@ -133,6 +135,8 @@ def test_current_fieldnames():
"line-number",
"resource",
"value",
"GeoX",
"GeoY",
]
)

Expand Down