digital-land · CarlosCoelhoSL · Apr 24, 2025 · Apr 24, 2025
diff --git a/digital_land/commands.py b/digital_land/commands.py
@@ -340,6 +340,7 @@ def pipeline_run(
         EntityReferencePhase(
             dataset=dataset,
             prefix=specification.dataset_prefix(dataset),
+            valid_reference_prefixes=specification.dataset,
         ),
         EntityPrefixPhase(dataset=dataset),
         EntityLookupPhase(
@@ -1408,6 +1409,7 @@ def get_resource_unidentified_lookups(
         EntityReferencePhase(
             dataset=dataset,
             prefix=specification.dataset_prefix(dataset),
+            valid_reference_prefixes=specification.dataset,
         ),
         EntityPrefixPhase(dataset=dataset),
         print_lookup_phase,

diff --git a/digital_land/phase/reference.py b/digital_land/phase/reference.py
@@ -23,9 +23,16 @@ class EntityReferencePhase(Phase):
     ensure an entry has the prefix and reference fields
     """
 
-    def __init__(self, dataset=None, prefix=None, specification=None):
+    def __init__(
+        self,
+        dataset=None,
+        prefix=None,
+        specification=None,
+        valid_reference_prefixes=None,
+    ):
         self.dataset = dataset
         self.specification = specification
+        self.valid_reference_prefixes = valid_reference_prefixes
         if prefix:
             self.prefix = prefix
         elif specification:
@@ -48,6 +55,13 @@ def process_row(self, row):
         if "UPRN" in reference_prefix:
             reference_prefix = ""
 
+        if (
+            self.valid_reference_prefixes
+            and reference_prefix not in self.valid_reference_prefixes
+        ):
+            reference_prefix = ""
+        # case that isn't covered by this: prefix from reference that isn't a dataset will be filtered out. when will this happen?
+
         prefix = row.get("prefix", "") or reference_prefix or self.prefix
         return prefix, reference
 

diff --git a/tests/e2e/test_assign_entities.py b/tests/e2e/test_assign_entities.py
@@ -45,6 +45,37 @@ def mock_resource():
     os.remove(mock_csv_path)
 
 
+@pytest.fixture
+def mock_resource_with_colon():
+    row1 = {
+        "reference": "DA:20/21=S106:001:20/0284/FUL",
+        "organisation": "government-organisation:D1342",
+        "value": "test",
+    }
+    row2 = {
+        "reference": "DA:20/21=S106:001:21/0284/FUL",
+        "organisation": "government-organisation:D1342",
+        "value": "test",
+    }
+    row3 = {
+        "reference": "",
+        "organisation": "government-organisation:D1342",
+        "value": "test",
+    }
+
+    mock_csv_path = Path("mock_csv.csv")
+    with open(mock_csv_path, "w", encoding="utf-8") as f:
+        dictwriter = csv.DictWriter(f, fieldnames=row1.keys())
+        dictwriter.writeheader()
+        dictwriter.writerow(row1)
+        dictwriter.writerow(row2)
+        dictwriter.writerow(row3)
+
+    yield mock_csv_path
+
+    os.remove(mock_csv_path)
+
+
 @pytest.fixture
 def collection_dir(tmp_path):
     collection_dir = os.path.join(tmp_path, "collection")
@@ -447,3 +478,46 @@ def test_assign_entities_unique_assignment(
     assert len(set(combined_entities)) == len(set(initial_entities)) + len(
         set(updated_entities) - set(initial_entities)
     ), "No duplicates introduced."
+
+
+def test_command_assign_entities_colon_in_reference(
+    collection_dir,
+    pipeline_dir,
+    specification_dir,
+    organisation_path,
+    mock_resource_with_colon,
+):
+    """
+    This tests a function that, given a specific resource hash for an endpoint already added to the system,
+    should identify any missing lookups and add them to lookup.csv
+    """
+    collection_name = "ancient-woodland"
+    dataset_name = "ancient-woodland"
+    test_endpoint = "d779ad1c91c5a46e2d4ace4d5446d7d7f81df1ed058f882121070574697a5412"
+
+    collection = Collection(name=collection_name, directory=collection_dir)
+
+    assign_entities(
+        resource_file_paths=["mock_csv.csv"],
+        collection=collection,
+        dataset=dataset_name,
+        organisation=["government-organisation:D1342"],
+        pipeline_dir=pipeline_dir,
+        specification_dir=specification_dir,
+        organisation_path=organisation_path,
+        endpoints=[test_endpoint],
+    )
+
+    lookups = Lookups(pipeline_dir)
+    lookups.load_csv()
+
+    specification = Specification(specification_dir)
+    entity_range_min = specification.get_dataset_entity_min(dataset_name)
+
+    assert len(lookups.entries) > 0
+    assert lookups.entries[0]["entity"] == entity_range_min
+    assert lookups.entries[0]["reference"] == "20/21=S106:001:20/0284/FUL"
+    assert lookups.entries[0]["prefix"] == dataset_name
+    assert lookups.entries[1]["entity"] == str(int(entity_range_min) + 1)
+    assert lookups.entries[1]["reference"] == "20/21=S106:001:21/0284/FUL"
+    assert lookups.entries[1]["prefix"] == dataset_name
diff --git a/tests/unit/phase/test_reference.py b/tests/unit/phase/test_reference.py
@@ -15,7 +15,11 @@ def test_split_curie():
 
 
 def test_entity_reference():
-    phase = EntityReferencePhase(dataset="tree", prefix="tree")
+    phase = EntityReferencePhase(
+        dataset="tree",
+        prefix="tree",
+        valid_reference_prefixes={"tree": {}, "wikidata": {}},
+    )
 
     assert ("tree", "31") == phase.process_row({"reference": "31"})
     assert ("foo", "Q1234") == phase.process_row(
@@ -29,6 +33,7 @@ def test_entity_reference():
     assert ("foo", "Q1234") == phase.process_row(
         {"prefix": "foo", "reference": "wikidata:Q1234"}
     )
+    assert ("tree", "Q1234") == phase.process_row({"reference": "invalidprefix:Q1234"})
 
 
 def test_fact_reference():