From 7d20087749db34b5d56474d66a9d0844d4205016 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL Date: Thu, 24 Apr 2025 16:57:24 +0100 Subject: [PATCH 1/2] adds validity check for reference prefixes --- digital_land/commands.py | 2 + digital_land/phase/reference.py | 16 ++++++- tests/e2e/test_assign_entities.py | 74 ++++++++++++++++++++++++++++++ tests/unit/phase/test_reference.py | 5 +- 4 files changed, 95 insertions(+), 2 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 4aafb7f3f..751c79906 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -340,6 +340,7 @@ def pipeline_run( EntityReferencePhase( dataset=dataset, prefix=specification.dataset_prefix(dataset), + valid_reference_prefixes=specification.dataset, ), EntityPrefixPhase(dataset=dataset), EntityLookupPhase( @@ -1408,6 +1409,7 @@ def get_resource_unidentified_lookups( EntityReferencePhase( dataset=dataset, prefix=specification.dataset_prefix(dataset), + valid_reference_prefixes=specification.dataset, ), EntityPrefixPhase(dataset=dataset), print_lookup_phase, diff --git a/digital_land/phase/reference.py b/digital_land/phase/reference.py index 87b281a58..b2964da5f 100644 --- a/digital_land/phase/reference.py +++ b/digital_land/phase/reference.py @@ -23,9 +23,16 @@ class EntityReferencePhase(Phase): ensure an entry has the prefix and reference fields """ - def __init__(self, dataset=None, prefix=None, specification=None): + def __init__( + self, + dataset=None, + prefix=None, + specification=None, + valid_reference_prefixes=None, + ): self.dataset = dataset self.specification = specification + self.valid_reference_prefixes = valid_reference_prefixes if prefix: self.prefix = prefix elif specification: @@ -48,6 +55,13 @@ def process_row(self, row): if "UPRN" in reference_prefix: reference_prefix = "" + if ( + self.valid_reference_prefixes + and reference_prefix not in self.valid_reference_prefixes + ): + reference_prefix = "" + # case that isn't covered by this: prefix from reference that isn't a dataset will be filtered out. when will this happen? + prefix = row.get("prefix", "") or reference_prefix or self.prefix return prefix, reference diff --git a/tests/e2e/test_assign_entities.py b/tests/e2e/test_assign_entities.py index 35a5e5e4a..06ebaec84 100644 --- a/tests/e2e/test_assign_entities.py +++ b/tests/e2e/test_assign_entities.py @@ -45,6 +45,37 @@ def mock_resource(): os.remove(mock_csv_path) +@pytest.fixture +def mock_resource_with_colon(): + row1 = { + "reference": "DA:20/21=S106:001:20/0284/FUL", + "organisation": "government-organisation:D1342", + "value": "test", + } + row2 = { + "reference": "DA:20/21=S106:001:21/0284/FUL", + "organisation": "government-organisation:D1342", + "value": "test", + } + row3 = { + "reference": "", + "organisation": "government-organisation:D1342", + "value": "test", + } + + mock_csv_path = Path("mock_csv.csv") + with open(mock_csv_path, "w", encoding="utf-8") as f: + dictwriter = csv.DictWriter(f, fieldnames=row1.keys()) + dictwriter.writeheader() + dictwriter.writerow(row1) + dictwriter.writerow(row2) + dictwriter.writerow(row3) + + yield mock_csv_path + + os.remove(mock_csv_path) + + @pytest.fixture def collection_dir(tmp_path): collection_dir = os.path.join(tmp_path, "collection") @@ -447,3 +478,46 @@ def test_assign_entities_unique_assignment( assert len(set(combined_entities)) == len(set(initial_entities)) + len( set(updated_entities) - set(initial_entities) ), "No duplicates introduced." + + +def test_command_assign_entities_colon_in_reference( + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + mock_resource_with_colon, +): + """ + This tests a function that, given a specific resource hash for an endpoint already added to the system, + should identify any missing lookups and add them to lookup.csv + """ + collection_name = "ancient-woodland" + dataset_name = "ancient-woodland" + test_endpoint = "d779ad1c91c5a46e2d4ace4d5446d7d7f81df1ed058f882121070574697a5412" + + collection = Collection(name=collection_name, directory=collection_dir) + + assign_entities( + resource_file_paths=["mock_csv.csv"], + collection=collection, + dataset=dataset_name, + organisation=["government-organisation:D1342"], + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + endpoints=[test_endpoint], + ) + + lookups = Lookups(pipeline_dir) + lookups.load_csv() + + specification = Specification(specification_dir) + entity_range_min = specification.get_dataset_entity_min(dataset_name) + + assert len(lookups.entries) > 0 + assert lookups.entries[0]["entity"] == entity_range_min + assert lookups.entries[0]["reference"] == "20/21=S106:001:20/0284/FUL" + assert lookups.entries[0]["prefix"] == dataset_name + assert lookups.entries[1]["entity"] == str(int(entity_range_min) + 1) + assert lookups.entries[1]["reference"] == "20/21=S106:001:21/0284/FUL" + assert lookups.entries[1]["prefix"] == dataset_name diff --git a/tests/unit/phase/test_reference.py b/tests/unit/phase/test_reference.py index 849bf16e0..4f9d28d1a 100755 --- a/tests/unit/phase/test_reference.py +++ b/tests/unit/phase/test_reference.py @@ -15,7 +15,9 @@ def test_split_curie(): def test_entity_reference(): - phase = EntityReferencePhase(dataset="tree", prefix="tree") + phase = EntityReferencePhase( + dataset="tree", prefix="tree", valid_prefixes={"tree": {}, "wikidata": {}} + ) assert ("tree", "31") == phase.process_row({"reference": "31"}) assert ("foo", "Q1234") == phase.process_row( @@ -29,6 +31,7 @@ def test_entity_reference(): assert ("foo", "Q1234") == phase.process_row( {"prefix": "foo", "reference": "wikidata:Q1234"} ) + assert ("tree", "Q1234") == phase.process_row({"reference": "invalidprefix:Q1234"}) def test_fact_reference(): From 219a64625fd0d8178a040a68c92e0680d6db7646 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL Date: Thu, 24 Apr 2025 17:00:56 +0100 Subject: [PATCH 2/2] fixes typo --- tests/unit/phase/test_reference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/phase/test_reference.py b/tests/unit/phase/test_reference.py index 4f9d28d1a..4f06234fc 100755 --- a/tests/unit/phase/test_reference.py +++ b/tests/unit/phase/test_reference.py @@ -16,7 +16,9 @@ def test_split_curie(): def test_entity_reference(): phase = EntityReferencePhase( - dataset="tree", prefix="tree", valid_prefixes={"tree": {}, "wikidata": {}} + dataset="tree", + prefix="tree", + valid_reference_prefixes={"tree": {}, "wikidata": {}}, ) assert ("tree", "31") == phase.process_row({"reference": "31"})