|
3 | 3 | import csv |
4 | 4 | import urllib.request |
5 | 5 | import pandas as pd |
| 6 | +import logging |
6 | 7 | from urllib.error import URLError |
7 | 8 | from digital_land.pipeline import Pipeline |
8 | 9 | from digital_land.pipeline import Lookups |
9 | 10 | from digital_land.specification import Specification |
10 | 11 | from digital_land.organisation import Organisation |
11 | 12 |
|
| 13 | +logger = logging.getLogger(__name__) |
| 14 | + |
12 | 15 |
|
13 | 16 | def write_as_csv(dir, filename, data): |
14 | 17 | with open(os.path.join(dir, filename), "w") as f: |
@@ -551,6 +554,53 @@ def test_load_concat_no_prepend_append(empty_pipeline_dir): |
551 | 554 | } |
552 | 555 |
|
553 | 556 |
|
def test_load_lookup_creates_single_lookup_per_row(tmp_path):
    """
    Verify that constructing a Pipeline from a lookup.csv produces exactly
    one lookup entry per CSV row, keyed on the normalized combination of
    entry-number, prefix, reference and organisation (lowercased, with
    special characters such as ':' stripped), and scoped by resource.
    """
    # -- Arrange --
    pipe_dir = tmp_path / "pipeline"
    pipe_dir.mkdir()

    pipeline_name = "test-pipeline"

    # Three rows exercising different field combinations:
    #   row 1: prefix + reference + organisation, no resource/entry-number
    #   row 2: entry-number + prefix + reference, no organisation
    #   row 3: resource-scoped entry
    rows = [
        {
            "resource": "",
            "entry-number": "",
            "prefix": "ancient-woodland",
            "reference": "AW001",
            "organisation": "local-authority:ABC",
            "entity": "1000001",
        },
        {
            "resource": "",
            "entry-number": "5",
            "prefix": "conservation-area",
            "reference": "CA002",
            "organisation": "",
            "entity": "1000002",
        },
        {
            "resource": "res123",
            "entry-number": "",
            "prefix": "listed-building",
            "reference": "LB003",
            "organisation": "local-authority:XYZ",
            "entity": "1000003",
        },
    ]
    pd.DataFrame(rows).to_csv(f"{pipe_dir}/lookup.csv", index=False)

    # -- Act --
    pipeline = Pipeline(str(pipe_dir), pipeline_name)

    # -- Assert --
    # Rows without a resource land under the empty-string resource key.
    assert "" in pipeline.lookup, "Should have empty resource key for general lookups"
    general = pipeline.lookup[""]

    # Row 1: key is normalized — lowercased, ':' removed from organisation.
    key_row1 = ",ancient-woodland,aw001,local-authorityabc"
    assert key_row1 in general
    assert general[key_row1] == "1000001"

    # Row 2: entry-number participates in the key; organisation slot is empty.
    key_row2 = "5,conservation-area,ca002,"
    assert key_row2 in general
    assert general[key_row2] == "1000002"

    # Row 3: entries with a resource are stored under that resource's key.
    assert "res123" in pipeline.lookup, "Should have resource-specific key"
    scoped = pipeline.lookup["res123"]
    key_row3 = ",listed-building,lb003,local-authorityxyz"
    assert key_row3 in scoped
    assert scoped[key_row3] == "1000003"
554 | 604 | @pytest.fixture(scope="session") |
555 | 605 | def specification_dir(tmp_path_factory): |
556 | 606 | """Download specification files from GitHub for testing""" |
@@ -665,7 +715,7 @@ def get_test_lookup_config(): |
665 | 715 | "reference": ["0", "1"], |
666 | 716 | "entity": ["2200001", "2200002"], |
667 | 717 | "start-date": ["", ""], |
668 | | - "organisation": ["101", "101"], |
| 718 | + "organisation": ["local-authority:LBH", "local-authority:LBH"], |
669 | 719 | "end-date": ["", ""], |
670 | 720 | "entry-date": ["", ""], |
671 | 721 | "endpoint": ["", ""], |
@@ -713,11 +763,11 @@ def test_pipeline_transform_basic( |
713 | 763 | pd.DataFrame(get_test_lookup_config()).to_csv( |
714 | 764 | f"{pipeline_dir}/lookup.csv", index=False |
715 | 765 | ) |
716 | | - |
717 | 766 | # Initialize pipeline components |
718 | 767 | spec = Specification(specification_dir) |
719 | 768 | org = Organisation(organisation_path=organisation_path) |
720 | 769 | pipeline = Pipeline(str(pipeline_dir), dataset_name, specification=spec) |
| 770 | + logger.info(f"Pipeline Lookups: {pipeline.lookup}") |
721 | 771 |
|
722 | 772 | output_path = tmp_path / "output" / "transformed.csv" |
723 | 773 | output_path.parent.mkdir(parents=True, exist_ok=True) |
@@ -893,7 +943,3 @@ def test_pipeline_transform_with_unmapped_reference_lookup_disabled( |
893 | 943 | # Verify reference field for unmapped data exists in output |
894 | 944 | reference_values = output_df[output_df["field"] == "reference"]["value"].tolist() |
895 | 945 | assert "99" in reference_values, "Unmapped reference 99 should be in output" |
896 | | - |
897 | | - |
898 | | -if __name__ == "__main__": |
899 | | - pytest.main() |
|
0 commit comments