From a6092c144ec467e711200c8775c16048375a2702 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 9 Mar 2026 13:38:23 -0400 Subject: [PATCH] Add edge-case tests for cross-table lookup (#136) Cover four gaps in test coverage for the LookupIndex and transform_spec engine introduced in PR #136: - Duplicate keys: verify LIMIT 1 first-match semantics and document that the returned row is non-deterministic for non-unique keys - Empty secondary tables: headers-only TSV files register and query cleanly, returning None on lookup - LookupIndex lifecycle: close() clears tables, operations after close() raise, double-close is safe - Engine no-joins regression: transform_spec works correctly when class_derivations have no joins block (common case) - Mixed derivations: joins and non-joins class_derivations coexist All 11 tests pass on the cross-table-lookup branch. Co-Authored-By: Claude Opus 4.6 --- .../test_engine_edge_cases.py | 245 ++++++++++++++++++ .../test_lookup_index_edge_cases.py | 121 +++++++++ 2 files changed, 366 insertions(+) create mode 100644 tests/test_transformer/test_engine_edge_cases.py create mode 100644 tests/test_utils/test_lookup_index_edge_cases.py diff --git a/tests/test_transformer/test_engine_edge_cases.py b/tests/test_transformer/test_engine_edge_cases.py new file mode 100644 index 0000000..265a79e --- /dev/null +++ b/tests/test_transformer/test_engine_edge_cases.py @@ -0,0 +1,245 @@ +"""Edge-case tests for the transform_spec engine (supplements test_cross_table_lookup.py). + +Covers: +- Engine with no-joins class_derivation (regression safety) +- Empty joined table (headers only) +- Mixed derivations: one with joins, one without + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +# ruff: noqa: ANN401, PLR2004 + +import textwrap + +import yaml +from linkml_runtime import SchemaView + +from linkml_map.loaders.data_loaders import DataLoader +from linkml_map.transformer.engine import transform_spec +from linkml_map.transformer.object_transformer import ObjectTransformer + + +# ---- shared schemas ---- + +SOURCE_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-source + name: engine_test_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + samples: + attributes: + sample_id: + identifier: true + name: {} + site_code: {} + sites: + attributes: + site_code: + identifier: true + site_name: {} +""") + +TARGET_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} +""") + + +def _make_transformer(source_sv, target_sv, spec_yaml): + """Build an ObjectTransformer from inline YAML strings.""" + tr = ObjectTransformer(unrestricted_eval=False) + tr.source_schemaview = source_sv + tr.target_schemaview = target_sv + tr.create_transformer_specification(yaml.safe_load(spec_yaml)) + return tr + + +# ---- no-joins regression ---- + + +def test_engine_no_joins(tmp_path): + """transform_spec works for a class_derivation with no joins block. + + This is a regression test ensuring the join machinery doesn't break + the common case where joins are not used. + """ + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + "S002\tBeta\tSITE_B\n" + ) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 2 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + assert results[1]["sample_id"] == "S002" + assert results[1]["name"] == "Beta" + + +def test_engine_no_joins_no_data(tmp_path): + """transform_spec gracefully yields nothing when the data file doesn't exist.""" + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) # no files in tmp_path + + results = list(transform_spec(tr, loader)) + assert results == [] + + +# ---- empty joined table ---- + + +def test_join_with_empty_secondary_table(tmp_path): + """When a joined table has headers but no data rows, lookups return None.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + # sites.tsv has headers only — no data rows + (tmp_path / "sites.tsv").write_text("site_code\tsite_name\n") + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 1 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + # No matching row in empty sites table → None via null propagation + assert results[0].get("site_name") is None + + +# ---- mixed: one derivation with joins, one without ---- + + +def test_mixed_derivations_with_and_without_joins(tmp_path): + """Multiple class_derivations can coexist: some with joins, some without.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + (tmp_path / "sites.tsv").write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + + # Two target classes: one uses joins, one doesn't + target_yaml = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} + SimpleSample: + attributes: + sample_id: + identifier: true + name: {} + """) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + SimpleSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(target_yaml) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + # Should get results from both derivations + assert len(results) == 2 + # First: FlatSample with join + assert results[0]["site_name"] == "Boston Medical" + # Second: SimpleSample without join + assert results[1]["sample_id"] == "S001" + assert results[1]["name"] == "Alpha" diff --git a/tests/test_utils/test_lookup_index_edge_cases.py b/tests/test_utils/test_lookup_index_edge_cases.py new file mode 100644 index 0000000..40ae4a7 --- /dev/null +++ b/tests/test_utils/test_lookup_index_edge_cases.py @@ -0,0 +1,121 @@ +"""Edge-case tests for LookupIndex (supplements test_lookup_index.py). + +Covers: +- Duplicate key behavior (LIMIT 1 first-match semantics) +- Empty tables (headers only, zero data rows) +- Lifecycle after close() (operations should fail gracefully) + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +import duckdb +import pytest + +from linkml_map.utils.lookup_index import LookupIndex + + +@pytest.fixture() +def index(): + """Create a LookupIndex and close it after the test.""" + idx = LookupIndex() + yield idx + idx.close() + + +# ---- Duplicate key behavior ---- + + +def test_duplicate_keys_returns_a_row(index, tmp_path): + """When multiple rows share the same key, lookup_row returns one of them. + + The current implementation uses ``LIMIT 1`` without an ``ORDER BY``, + so the returned row is deterministic per DuckDB's storage order (insertion + order for ``read_csv_auto``) but this is NOT guaranteed by the API. + This test documents the behavior without asserting which duplicate wins. + """ + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "participant_id\tname\tage\n" + "P001\tAlice\t30\n" + "P001\tAlice-v2\t31\n" + "P002\tBob\t25\n" + ) + index.register_table("dupes", tsv, "participant_id") + row = index.lookup_row("dupes", "participant_id", "P001") + + # A row IS returned (not None) + assert row is not None + assert row["participant_id"] == "P001" + # The name is one of the two duplicate rows + assert row["name"] in {"Alice", "Alice-v2"} + + +def test_duplicate_keys_unique_rows_unaffected(index, tmp_path): + """Rows with unique keys are unaffected by the presence of duplicates elsewhere.""" + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "id\tvalue\n" + "A\t1\n" + "A\t2\n" + "B\t3\n" + ) + index.register_table("dupes", tsv, "id") + row = index.lookup_row("dupes", "id", "B") + assert row is not None + assert row["value"] == "3" + + +# ---- Empty tables ---- + + +def test_empty_table_headers_only(index, tmp_path): + """A table with column headers but zero data rows can be registered and queried.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tname\tage\n") + index.register_table("empty", tsv, "id") + + assert index.is_registered("empty") + assert index.lookup_row("empty", "id", "anything") is None + + +def test_empty_table_then_drop(index, tmp_path): + """An empty table can be dropped without error.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tvalue\n") + index.register_table("empty", tsv, "id") + index.drop("empty") + assert not index.is_registered("empty") + + +# ---- Lifecycle after close() ---- + + +def test_close_clears_tables(index, tmp_path): + """After close(), is_registered returns False for all tables.""" + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + index.register_table("data", tsv, "id") + assert index.is_registered("data") + + index.close() + assert not index.is_registered("data") + + +def test_operations_after_close_raise(tmp_path): + """Register and lookup operations after close() raise an error.""" + idx = LookupIndex() + idx.close() + + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + + with pytest.raises((duckdb.ConnectionException, duckdb.InvalidInputException)): + idx.register_table("data", tsv, "id") + + +def test_double_close_is_safe(): + """Calling close() twice does not raise.""" + idx = LookupIndex() + idx.close() + # Second close should not raise + idx.close()