Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions digital_land/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ def pipeline_run(
EntityReferencePhase(
dataset=dataset,
prefix=specification.dataset_prefix(dataset),
valid_reference_prefixes=specification.dataset,
),
EntityPrefixPhase(dataset=dataset),
EntityLookupPhase(
Expand Down Expand Up @@ -1408,6 +1409,7 @@ def get_resource_unidentified_lookups(
EntityReferencePhase(
dataset=dataset,
prefix=specification.dataset_prefix(dataset),
valid_reference_prefixes=specification.dataset,
),
EntityPrefixPhase(dataset=dataset),
print_lookup_phase,
Expand Down
16 changes: 15 additions & 1 deletion digital_land/phase/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,16 @@ class EntityReferencePhase(Phase):
ensure an entry has the prefix and reference fields
"""

def __init__(self, dataset=None, prefix=None, specification=None):
def __init__(
self,
dataset=None,
prefix=None,
specification=None,
valid_reference_prefixes=None,
):
self.dataset = dataset
self.specification = specification
self.valid_reference_prefixes = valid_reference_prefixes
if prefix:
self.prefix = prefix
elif specification:
Expand All @@ -48,6 +55,13 @@ def process_row(self, row):
if "UPRN" in reference_prefix:
reference_prefix = ""

if (
self.valid_reference_prefixes
and reference_prefix not in self.valid_reference_prefixes
):
reference_prefix = ""
# case that isn't covered by this: prefix from reference that isn't a dataset will be filtered out. when will this happen?

prefix = row.get("prefix", "") or reference_prefix or self.prefix
return prefix, reference

Expand Down
74 changes: 74 additions & 0 deletions tests/e2e/test_assign_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,37 @@ def mock_resource():
os.remove(mock_csv_path)


@pytest.fixture
def mock_resource_with_colon():
row1 = {
"reference": "DA:20/21=S106:001:20/0284/FUL",
"organisation": "government-organisation:D1342",
"value": "test",
}
row2 = {
"reference": "DA:20/21=S106:001:21/0284/FUL",
"organisation": "government-organisation:D1342",
"value": "test",
}
row3 = {
"reference": "",
"organisation": "government-organisation:D1342",
"value": "test",
}

mock_csv_path = Path("mock_csv.csv")
with open(mock_csv_path, "w", encoding="utf-8") as f:
dictwriter = csv.DictWriter(f, fieldnames=row1.keys())
dictwriter.writeheader()
dictwriter.writerow(row1)
dictwriter.writerow(row2)
dictwriter.writerow(row3)

yield mock_csv_path

os.remove(mock_csv_path)


@pytest.fixture
def collection_dir(tmp_path):
collection_dir = os.path.join(tmp_path, "collection")
Expand Down Expand Up @@ -447,3 +478,46 @@ def test_assign_entities_unique_assignment(
assert len(set(combined_entities)) == len(set(initial_entities)) + len(
set(updated_entities) - set(initial_entities)
), "No duplicates introduced."


def test_command_assign_entities_colon_in_reference(
collection_dir,
pipeline_dir,
specification_dir,
organisation_path,
mock_resource_with_colon,
):
"""
This tests a function that, given a specific resource hash for an endpoint already added to the system,
should identify any missing lookups and add them to lookup.csv
"""
collection_name = "ancient-woodland"
dataset_name = "ancient-woodland"
test_endpoint = "d779ad1c91c5a46e2d4ace4d5446d7d7f81df1ed058f882121070574697a5412"

collection = Collection(name=collection_name, directory=collection_dir)

assign_entities(
resource_file_paths=["mock_csv.csv"],
collection=collection,
dataset=dataset_name,
organisation=["government-organisation:D1342"],
pipeline_dir=pipeline_dir,
specification_dir=specification_dir,
organisation_path=organisation_path,
endpoints=[test_endpoint],
)

lookups = Lookups(pipeline_dir)
lookups.load_csv()

specification = Specification(specification_dir)
entity_range_min = specification.get_dataset_entity_min(dataset_name)

assert len(lookups.entries) > 0
assert lookups.entries[0]["entity"] == entity_range_min
assert lookups.entries[0]["reference"] == "20/21=S106:001:20/0284/FUL"
assert lookups.entries[0]["prefix"] == dataset_name
assert lookups.entries[1]["entity"] == str(int(entity_range_min) + 1)
assert lookups.entries[1]["reference"] == "20/21=S106:001:21/0284/FUL"
assert lookups.entries[1]["prefix"] == dataset_name
7 changes: 6 additions & 1 deletion tests/unit/phase/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def test_split_curie():


def test_entity_reference():
phase = EntityReferencePhase(dataset="tree", prefix="tree")
phase = EntityReferencePhase(
dataset="tree",
prefix="tree",
valid_reference_prefixes={"tree": {}, "wikidata": {}},
)

assert ("tree", "31") == phase.process_row({"reference": "31"})
assert ("foo", "Q1234") == phase.process_row(
Expand All @@ -29,6 +33,7 @@ def test_entity_reference():
assert ("foo", "Q1234") == phase.process_row(
{"prefix": "foo", "reference": "wikidata:Q1234"}
)
assert ("tree", "Q1234") == phase.process_row({"reference": "invalidprefix:Q1234"})


def test_fact_reference():
Expand Down