From a6092c144ec467e711200c8775c16048375a2702 Mon Sep 17 00:00:00 2001
From: "Mark A. Miller" <MAM@lbl.gov>
Date: Mon, 9 Mar 2026 13:38:23 -0400
Subject: [PATCH] Add edge-case tests for cross-table lookup (#136)

Cover four gaps in test coverage for the LookupIndex and
transform_spec engine introduced in PR #136:

- Duplicate keys: verify LIMIT 1 first-match semantics and document
  that the returned row is non-deterministic for non-unique keys
- Empty secondary tables: headers-only TSV files register and query
  cleanly, returning None on lookup
- LookupIndex lifecycle: close() clears tables, operations after
  close() raise, double-close is safe
- Engine no-joins regression: transform_spec works correctly when
  class_derivations have no joins block (common case)
- Mixed derivations: joins and non-joins class_derivations coexist

All 11 tests pass on the cross-table-lookup branch.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../test_engine_edge_cases.py                 | 245 ++++++++++++++++++
 .../test_lookup_index_edge_cases.py           | 121 +++++++++
 2 files changed, 366 insertions(+)
 create mode 100644 tests/test_transformer/test_engine_edge_cases.py
 create mode 100644 tests/test_utils/test_lookup_index_edge_cases.py

diff --git a/tests/test_transformer/test_engine_edge_cases.py b/tests/test_transformer/test_engine_edge_cases.py
new file mode 100644
index 0000000..265a79e
--- /dev/null
+++ b/tests/test_transformer/test_engine_edge_cases.py
@@ -0,0 +1,245 @@
+"""Edge-case tests for the transform_spec engine (supplements test_cross_table_lookup.py).
+
+Covers:
+- Engine with no-joins class_derivation (regression safety)
+- Empty joined table (headers only)
+- Mixed derivations: one with joins, one without
+
+See: https://github.com/linkml/linkml-map/pull/136
+"""
+
+# ruff: noqa: ANN401, PLR2004
+
+import textwrap
+
+import yaml
+from linkml_runtime import SchemaView
+
+from linkml_map.loaders.data_loaders import DataLoader
+from linkml_map.transformer.engine import transform_spec
+from linkml_map.transformer.object_transformer import ObjectTransformer
+
+
+# ---- shared schemas ----
+
+SOURCE_SCHEMA_YAML = textwrap.dedent("""\
+    id: https://example.org/engine-test-source
+    name: engine_test_source
+    prefixes:
+      linkml: https://w3id.org/linkml/
+    imports:
+      - linkml:types
+    default_range: string
+    classes:
+      samples:
+        attributes:
+          sample_id:
+            identifier: true
+          name: {}
+          site_code: {}
+      sites:
+        attributes:
+          site_code:
+            identifier: true
+          site_name: {}
+""")
+
+TARGET_SCHEMA_YAML = textwrap.dedent("""\
+    id: https://example.org/engine-test-target
+    name: engine_test_target
+    prefixes:
+      linkml: https://w3id.org/linkml/
+    imports:
+      - linkml:types
+    default_range: string
+    classes:
+      FlatSample:
+        attributes:
+          sample_id:
+            identifier: true
+          name: {}
+          site_name: {}
+""")
+
+
+def _make_transformer(source_sv, target_sv, spec_yaml):
+    """Build an ObjectTransformer from inline YAML strings."""
+    tr = ObjectTransformer(unrestricted_eval=False)
+    tr.source_schemaview = source_sv
+    tr.target_schemaview = target_sv
+    tr.create_transformer_specification(yaml.safe_load(spec_yaml))
+    return tr
+
+
+# ---- no-joins regression ----
+
+
+def test_engine_no_joins(tmp_path):
+    """transform_spec works for a class_derivation with no joins block.
+
+    This is a regression test ensuring the join machinery doesn't break
+    the common case where joins are not used.
+    """
+    (tmp_path / "samples.tsv").write_text(
+        "sample_id\tname\tsite_code\n"
+        "S001\tAlpha\tSITE_A\n"
+        "S002\tBeta\tSITE_B\n"
+    )
+
+    spec = textwrap.dedent("""\
+        class_derivations:
+          FlatSample:
+            populated_from: samples
+            slot_derivations:
+              sample_id:
+                populated_from: sample_id
+              name:
+                populated_from: name
+    """)
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(TARGET_SCHEMA_YAML)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)
+
+    results = list(transform_spec(tr, loader))
+
+    assert len(results) == 2
+    assert results[0]["sample_id"] == "S001"
+    assert results[0]["name"] == "Alpha"
+    assert results[1]["sample_id"] == "S002"
+    assert results[1]["name"] == "Beta"
+
+
+def test_engine_no_joins_no_data(tmp_path):
+    """transform_spec gracefully yields nothing when the data file doesn't exist."""
+    spec = textwrap.dedent("""\
+        class_derivations:
+          FlatSample:
+            populated_from: samples
+            slot_derivations:
+              sample_id:
+                populated_from: sample_id
+    """)
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(TARGET_SCHEMA_YAML)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)  # no files in tmp_path
+
+    results = list(transform_spec(tr, loader))
+    assert results == []
+
+
+# ---- empty joined table ----
+
+
+def test_join_with_empty_secondary_table(tmp_path):
+    """When a joined table has headers but no data rows, lookups return None."""
+    (tmp_path / "samples.tsv").write_text(
+        "sample_id\tname\tsite_code\n"
+        "S001\tAlpha\tSITE_A\n"
+    )
+    # sites.tsv has headers only — no data rows
+    (tmp_path / "sites.tsv").write_text("site_code\tsite_name\n")
+
+    spec = textwrap.dedent("""\
+        class_derivations:
+          FlatSample:
+            populated_from: samples
+            joins:
+              sites:
+                join_on: site_code
+            slot_derivations:
+              sample_id:
+                populated_from: sample_id
+              name:
+                populated_from: name
+              site_name:
+                expr: "{sites.site_name}"
+    """)
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(TARGET_SCHEMA_YAML)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)
+
+    results = list(transform_spec(tr, loader))
+
+    assert len(results) == 1
+    assert results[0]["sample_id"] == "S001"
+    assert results[0]["name"] == "Alpha"
+    # No matching row in empty sites table → None via null propagation
+    assert results[0].get("site_name") is None
+
+
+# ---- mixed: one derivation with joins, one without ----
+
+
+def test_mixed_derivations_with_and_without_joins(tmp_path):
+    """Multiple class_derivations can coexist: some with joins, some without."""
+    (tmp_path / "samples.tsv").write_text(
+        "sample_id\tname\tsite_code\n"
+        "S001\tAlpha\tSITE_A\n"
+    )
+    (tmp_path / "sites.tsv").write_text(
+        "site_code\tsite_name\n"
+        "SITE_A\tBoston Medical\n"
+    )
+
+    # Two target classes: one uses joins, one doesn't
+    target_yaml = textwrap.dedent("""\
+        id: https://example.org/engine-test-target
+        name: engine_test_target
+        prefixes:
+          linkml: https://w3id.org/linkml/
+        imports:
+          - linkml:types
+        default_range: string
+        classes:
+          FlatSample:
+            attributes:
+              sample_id:
+                identifier: true
+              name: {}
+              site_name: {}
+          SimpleSample:
+            attributes:
+              sample_id:
+                identifier: true
+              name: {}
+    """)
+
+    spec = textwrap.dedent("""\
+        class_derivations:
+          FlatSample:
+            populated_from: samples
+            joins:
+              sites:
+                join_on: site_code
+            slot_derivations:
+              sample_id:
+                populated_from: sample_id
+              name:
+                populated_from: name
+              site_name:
+                expr: "{sites.site_name}"
+          SimpleSample:
+            populated_from: samples
+            slot_derivations:
+              sample_id:
+                populated_from: sample_id
+              name:
+                populated_from: name
+    """)
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(target_yaml)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)
+
+    results = list(transform_spec(tr, loader))
+
+    # Should get results from both derivations
+    assert len(results) == 2
+    # First: FlatSample with join
+    assert results[0]["site_name"] == "Boston Medical"
+    # Second: SimpleSample without join
+    assert results[1]["sample_id"] == "S001"
+    assert results[1]["name"] == "Alpha"
diff --git a/tests/test_utils/test_lookup_index_edge_cases.py b/tests/test_utils/test_lookup_index_edge_cases.py
new file mode 100644
index 0000000..40ae4a7
--- /dev/null
+++ b/tests/test_utils/test_lookup_index_edge_cases.py
@@ -0,0 +1,121 @@
+"""Edge-case tests for LookupIndex (supplements test_lookup_index.py).
+
+Covers:
+- Duplicate key behavior (LIMIT 1 first-match semantics)
+- Empty tables (headers only, zero data rows)
+- Lifecycle after close() (operations should fail gracefully)
+
+See: https://github.com/linkml/linkml-map/pull/136
+"""
+
+import duckdb
+import pytest
+
+from linkml_map.utils.lookup_index import LookupIndex
+
+
+@pytest.fixture()
+def index():
+    """Create a LookupIndex and close it after the test."""
+    idx = LookupIndex()
+    yield idx
+    idx.close()
+
+
+# ---- Duplicate key behavior ----
+
+
+def test_duplicate_keys_returns_a_row(index, tmp_path):
+    """When multiple rows share the same key, lookup_row returns one of them.
+
+    The current implementation uses ``LIMIT 1`` without an ``ORDER BY``,
+    so the returned row is deterministic per DuckDB's storage order (insertion
+    order for ``read_csv_auto``) but this is NOT guaranteed by the API.
+    This test documents the behavior without asserting which duplicate wins.
+    """
+    tsv = tmp_path / "dupes.tsv"
+    tsv.write_text(
+        "participant_id\tname\tage\n"
+        "P001\tAlice\t30\n"
+        "P001\tAlice-v2\t31\n"
+        "P002\tBob\t25\n"
+    )
+    index.register_table("dupes", tsv, "participant_id")
+    row = index.lookup_row("dupes", "participant_id", "P001")
+
+    # A row IS returned (not None)
+    assert row is not None
+    assert row["participant_id"] == "P001"
+    # The name is one of the two duplicate rows
+    assert row["name"] in {"Alice", "Alice-v2"}
+
+
+def test_duplicate_keys_unique_rows_unaffected(index, tmp_path):
+    """Rows with unique keys are unaffected by the presence of duplicates elsewhere."""
+    tsv = tmp_path / "dupes.tsv"
+    tsv.write_text(
+        "id\tvalue\n"
+        "A\t1\n"
+        "A\t2\n"
+        "B\t3\n"
+    )
+    index.register_table("dupes", tsv, "id")
+    row = index.lookup_row("dupes", "id", "B")
+    assert row is not None
+    assert row["value"] == "3"
+
+
+# ---- Empty tables ----
+
+
+def test_empty_table_headers_only(index, tmp_path):
+    """A table with column headers but zero data rows can be registered and queried."""
+    tsv = tmp_path / "empty.tsv"
+    tsv.write_text("id\tname\tage\n")
+    index.register_table("empty", tsv, "id")
+
+    assert index.is_registered("empty")
+    assert index.lookup_row("empty", "id", "anything") is None
+
+
+def test_empty_table_then_drop(index, tmp_path):
+    """An empty table can be dropped without error."""
+    tsv = tmp_path / "empty.tsv"
+    tsv.write_text("id\tvalue\n")
+    index.register_table("empty", tsv, "id")
+    index.drop("empty")
+    assert not index.is_registered("empty")
+
+
+# ---- Lifecycle after close() ----
+
+
+def test_close_clears_tables(index, tmp_path):
+    """After close(), is_registered returns False for all tables."""
+    tsv = tmp_path / "data.tsv"
+    tsv.write_text("id\tval\nA\t1\n")
+    index.register_table("data", tsv, "id")
+    assert index.is_registered("data")
+
+    index.close()
+    assert not index.is_registered("data")
+
+
+def test_operations_after_close_raise(tmp_path):
+    """Register and lookup operations after close() raise an error."""
+    idx = LookupIndex()
+    idx.close()
+
+    tsv = tmp_path / "data.tsv"
+    tsv.write_text("id\tval\nA\t1\n")
+
+    with pytest.raises((duckdb.ConnectionException, duckdb.InvalidInputException)):
+        idx.register_table("data", tsv, "id")
+
+
+def test_double_close_is_safe():
+    """Calling close() twice does not raise."""
+    idx = LookupIndex()
+    idx.close()
+    # Second close should not raise
+    idx.close()