From 00c752ef8b63ff70c0db462478c710e6ce12cddc Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Wed, 4 Mar 2026 07:57:09 -0600 Subject: [PATCH 1/6] Add cross-table lookup support for join-based transformations Implement spec-driven cross-table lookups using DuckDB, enabling slot derivations to reference columns from secondary tables via `{table.column}` syntax in expressions. This supports biomedical data harmonization use cases (e.g., pulling demographics into measurement observations from different source tables). - Extend AliasedClass with source_key, lookup_key, join_on fields - Create DuckDB-backed LookupIndex for fast keyed row lookups - Fix _eval_set to accept {obj.attr} null-propagation syntax - Wire cross-table resolution into Bindings via join_specs - Add get_path() to DataLoader for file path resolution - Create transform_spec() engine for spec-driven processing - Add 18 tests (unit + integration) covering the full stack Closes #134 Co-Authored-By: Claude Opus 4.6 --- src/linkml_map/datamodel/transformer_model.py | 5 +- .../datamodel/transformer_model.yaml | 8 +- src/linkml_map/loaders/data_loaders.py | 14 + src/linkml_map/transformer/engine.py | 80 ++++ .../transformer/object_transformer.py | 26 +- src/linkml_map/utils/eval_utils.py | 5 +- src/linkml_map/utils/lookup_index.py | 89 +++++ .../test_cross_table_lookup.py | 347 ++++++++++++++++++ tests/test_utils/test_eval_utils.py | 32 ++ tests/test_utils/test_lookup_index.py | 84 +++++ 10 files changed, 685 insertions(+), 5 deletions(-) create mode 100644 src/linkml_map/transformer/engine.py create mode 100644 src/linkml_map/utils/lookup_index.py create mode 100644 tests/test_transformer/test_cross_table_lookup.py create mode 100644 tests/test_utils/test_lookup_index.py diff --git a/src/linkml_map/datamodel/transformer_model.py b/src/linkml_map/datamodel/transformer_model.py index 9c50f72..0f20561 100644 --- a/src/linkml_map/datamodel/transformer_model.py +++ b/src/linkml_map/datamodel/transformer_model.py @@ -263,7 +263,7 @@ class ClassDerivation(ElementDerivation): 'EnumDerivation', 'PermissibleValueDerivation']} }) joins: Optional[Dict[str, AliasedClass]] = Field(default_factory=dict, description="""Additional classes to be joined to derive instances of the target class""", json_schema_extra = { "linkml_meta": {'alias': 'joins', - 'comments': ['not yet implemented'], + 'comments': ['supports cross-table lookups via source_key/lookup_key or on shorthand'], 'domain_of': ['ClassDerivation']} }) slot_derivations: Optional[Dict[str, SlotDerivation]] = Field(default_factory=dict, json_schema_extra = { "linkml_meta": {'alias': 'slot_derivations', 'domain_of': ['TransformationSpecification', 'ClassDerivation']} }) @@ -341,6 +341,9 @@ class AliasedClass(ConfiguredBaseModel): alias: str = Field(default=..., description="""name of the class to be aliased""", json_schema_extra = { "linkml_meta": {'alias': 'alias', 'domain_of': ['AliasedClass']} }) class_named: Optional[str] = Field(default=None, description="""local alias for the class""", json_schema_extra = { "linkml_meta": {'alias': 'class_named', 'domain_of': ['AliasedClass']} }) + source_key: Optional[str] = Field(default=None, description="""column in the primary (populated_from) table used as the join key""", json_schema_extra = { "linkml_meta": {'alias': 'source_key', 'domain_of': ['AliasedClass']} }) + lookup_key: Optional[str] = Field(default=None, description="""column in the secondary (joined) table used as the join key""", json_schema_extra = { "linkml_meta": {'alias': 'lookup_key', 'domain_of': ['AliasedClass']} }) + join_on: Optional[str] = Field(default=None, description="""shorthand for source_key and lookup_key when both share the same column name""", json_schema_extra = { "linkml_meta": {'alias': 'join_on', 'domain_of': ['AliasedClass']} }) class SlotDerivation(ElementDerivation): diff --git a/src/linkml_map/datamodel/transformer_model.yaml b/src/linkml_map/datamodel/transformer_model.yaml index ab51e3f..3b3813f 100644 --- a/src/linkml_map/datamodel/transformer_model.yaml +++ b/src/linkml_map/datamodel/transformer_model.yaml @@ -185,7 +185,7 @@ classes: inlined: true description: Additional classes to be joined to derive instances of the target class comments: - - not yet implemented + - supports cross-table lookups via source_key/lookup_key or on shorthand slot_derivations: range: SlotDerivation multivalued: true @@ -220,6 +220,12 @@ classes: description: name of the class to be aliased class_named: description: local alias for the class + source_key: + description: column in the primary (populated_from) table used as the join key + lookup_key: + description: column in the secondary (joined) table used as the join key + join_on: + description: shorthand for source_key and lookup_key when both share the same column name SlotDerivation: is_a: ElementDerivation diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index 9c956fd..d4aa094 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -236,6 +236,20 @@ def _find_file(self, identifier: str) -> Optional[Path]: return None + def get_path(self, identifier: str) -> Path: + """ + Return the resolved file path for *identifier*. + + :param identifier: Logical table/file name (without extension). + :returns: Absolute path to the matching data file. + :raises FileNotFoundError: If no matching file is found. + """ + path = self._find_file(identifier) + if path is None: + msg = f"No data file found for identifier {identifier!r} under {self.base_path}" + raise FileNotFoundError(msg) + return path + def __contains__(self, identifier: str) -> bool: """Check if a data file exists for the given identifier.""" if self.is_single_file: diff --git a/src/linkml_map/transformer/engine.py b/src/linkml_map/transformer/engine.py new file mode 100644 index 0000000..f748083 --- /dev/null +++ b/src/linkml_map/transformer/engine.py @@ -0,0 +1,80 @@ +"""Spec-driven processing engine with cross-table lookup support.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +from linkml_map.utils.lookup_index import LookupIndex + +if TYPE_CHECKING: + from collections.abc import Iterator + + from linkml_map.loaders.data_loaders import DataLoader + from linkml_map.transformer.object_transformer import ObjectTransformer + +logger = logging.getLogger(__name__) + + +def transform_spec( + transformer: ObjectTransformer, + data_loader: DataLoader, + source_type: str | None = None, +) -> Iterator[dict[str, Any]]: + """ + Iterate class_derivation blocks and stream transformed rows. + + For each block whose ``populated_from`` names a loadable table, this + function: + + 1. Registers any ``joins`` as secondary tables in a :class:`LookupIndex`. + 2. Streams primary-table rows through + :meth:`ObjectTransformer.map_object`. + 3. Drops secondary tables when the block is done. + + :param transformer: A configured :class:`ObjectTransformer`. + :param data_loader: Loader that can resolve table names to file paths. + :param source_type: Optional explicit source type override. + :returns: Iterator of transformed row dicts. + """ + spec = transformer.derived_specification + if spec is None: + return + + if transformer.lookup_index is None: + transformer.lookup_index = LookupIndex() + + for class_deriv in spec.class_derivations: + table_name = class_deriv.populated_from or class_deriv.name + if table_name not in data_loader: + logger.debug("Skipping class_derivation %s: no data found", class_deriv.name) + continue + + joined_tables: list[str] = [] + try: + # Register secondary (joined) tables + if class_deriv.joins: + for join_name, join_spec in class_deriv.joins.items(): + lookup_key = join_spec.lookup_key or join_spec.join_on + if not lookup_key: + msg = ( + f"Join {join_name!r} must specify 'join_on' or " + f"'lookup_key'" + ) + raise ValueError(msg) + join_path = data_loader.get_path(join_name) + transformer.lookup_index.register_table( + join_name, join_path, lookup_key + ) + joined_tables.append(join_name) + + # Stream primary table rows + for row in data_loader[table_name]: + yield transformer.map_object( + row, + source_type=source_type or table_name, + class_derivation=class_deriv, + ) + finally: + for jt in joined_tables: + transformer.lookup_index.drop(jt) diff --git a/src/linkml_map/transformer/object_transformer.py b/src/linkml_map/transformer/object_transformer.py index fb67cdb..c990570 100644 --- a/src/linkml_map/transformer/object_transformer.py +++ b/src/linkml_map/transformer/object_transformer.py @@ -14,6 +14,7 @@ from pydantic import BaseModel from linkml_map.datamodel.transformer_model import ( + AliasedClass, ClassDerivation, CollectionType, PivotDirectionType, @@ -46,6 +47,7 @@ def __init__( # noqa: PLR0913 source_type: str, sv: SchemaView, bindings: dict, + join_specs: Optional[dict[str, AliasedClass]] = None, ) -> None: self.object_transformer: ObjectTransformer = object_transformer self.source_obj: OBJECT_TYPE = source_obj @@ -53,6 +55,7 @@ def __init__( # noqa: PLR0913 self.source_type: str = source_type self.sv: SchemaView = sv self.bindings: dict = {} + self.join_specs: dict[str, AliasedClass] = join_specs or {} if bindings: self.bindings.update(bindings) @@ -105,10 +108,29 @@ def __iter__(self) -> Iterator: def __getitem__(self, name: Any) -> Any: if name not in self.bindings: - _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) + if name in self.join_specs and self.object_transformer.lookup_index is not None: + self.bindings[name] = self._resolve_join(name) + else: + _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) return self.bindings.get(name) + def _resolve_join(self, table_name: str) -> DynObj | None: + """Resolve a cross-table lookup, returning a DynObj or None.""" + spec = self.join_specs[table_name] + source_key = spec.source_key or spec.join_on + lookup_key = spec.lookup_key or spec.join_on + if not source_key or not lookup_key: + msg = f"Join spec for {table_name!r} must specify 'on' or both 'source_key' and 'lookup_key'" + raise ValueError(msg) + key_val = self.source_obj.get(source_key) + if key_val is None: + return None + row = self.object_transformer.lookup_index.lookup_row(table_name, lookup_key, key_val) + if row is None: + return None + return DynObj(**row) + def __setitem__(self, name: Any, value: Any) -> None: del name, value msg = f"__setitem__ not allowed on class {self.__class__.__name__}" @@ -124,6 +146,7 @@ class ObjectTransformer(Transformer): """ object_index: ObjectIndex = None + lookup_index: Any = None # Optional[LookupIndex] — lazy import to avoid hard duckdb dep def index(self, source_obj: Any, target: Optional[str] = None) -> None: """ @@ -264,6 +287,7 @@ def map_object( source_type=source_type, sv=sv, bindings={"NULL": None}, + join_specs=class_deriv.joins if class_deriv.joins else None, ) try: diff --git a/src/linkml_map/utils/eval_utils.py b/src/linkml_map/utils/eval_utils.py index 3af263b..3124816 100644 --- a/src/linkml_map/utils/eval_utils.py +++ b/src/linkml_map/utils/eval_utils.py @@ -146,12 +146,13 @@ def _eval_set(self, node: ast.Set) -> Any: # noqa: ANN401 msg = "The {} must enclose a single variable" raise ValueError(msg) e = node.elts[0] - if not isinstance(e, ast.Name): + if not isinstance(e, (ast.Name, ast.Attribute)): msg = "The {} must enclose a variable" raise TypeError(msg) v = self._eval(e) if v is None: - msg = f"{e.id} is not set" + label = ast.dump(e) if isinstance(e, ast.Attribute) else e.id + msg = f"{label} is not set" raise UnsetValueError(msg) return v diff --git a/src/linkml_map/utils/lookup_index.py b/src/linkml_map/utils/lookup_index.py new file mode 100644 index 0000000..c379a2d --- /dev/null +++ b/src/linkml_map/utils/lookup_index.py @@ -0,0 +1,89 @@ +"""DuckDB-backed cross-table lookup index for join resolution.""" + +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any + +import duckdb + +_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") + + +def _validate_identifier(name: str) -> None: + """Validate that *name* is a safe SQL identifier.""" + if not _IDENTIFIER_RE.match(name): + msg = f"Invalid identifier: {name!r}" + raise ValueError(msg) + + +class LookupIndex: + """ + In-memory DuckDB index for cross-table lookups. + + Each registered table is loaded from a CSV/TSV file via ``read_csv_auto`` + and indexed on a key column for fast single-row lookups. + """ + + def __init__(self) -> None: + """Initialize an empty lookup index with an in-memory DuckDB connection.""" + self._conn = duckdb.connect(":memory:") + self._tables: dict[str, str] = {} # table_name -> key_column + + def register_table(self, name: str, file_path: Path | str, key_column: str) -> None: + """ + Load a CSV/TSV file into DuckDB and create an index on *key_column*. + + :param name: Logical table name (must be a valid identifier). + :param file_path: Path to a CSV or TSV file. + :param key_column: Column to index for lookups. + """ + _validate_identifier(name) + _validate_identifier(key_column) + file_path = Path(file_path) + self._conn.execute( + f"CREATE OR REPLACE TABLE {name} AS " # noqa: S608 + f"SELECT * FROM read_csv_auto('{file_path}', all_varchar=true)" + ) + self._conn.execute( + f"CREATE INDEX IF NOT EXISTS idx_{name}_{key_column} ON {name} ({key_column})" + ) + self._tables[name] = key_column + + def lookup_row( + self, table: str, key_col: str, key_val: Any # noqa: ANN401 + ) -> dict[str, Any] | None: + """ + Return the first row matching *key_val* on *key_col*, or ``None``. + + :param table: Previously registered table name. + :param key_col: Column to match on. + :param key_val: Value to look up. + :returns: Row as a dict, or None if not found. + """ + _validate_identifier(table) + _validate_identifier(key_col) + result = self._conn.execute( + f"SELECT * FROM {table} WHERE {key_col} = $1 LIMIT 1", # noqa: S608 + [str(key_val)], + ).fetchone() + if result is None: + return None + columns = [desc[0] for desc in self._conn.description] + return dict(zip(columns, result, strict=True)) + + def drop(self, table: str) -> None: + """Drop a registered table, releasing memory.""" + _validate_identifier(table) + self._conn.execute(f"DROP TABLE IF EXISTS {table}") + self._tables.pop(table, None) + + def is_registered(self, table: str) -> bool: + """Check whether *table* has been registered.""" + return table in self._tables + + def close(self) -> None: + """Close the DuckDB connection.""" + self._conn.close() + self._tables.clear() diff --git a/tests/test_transformer/test_cross_table_lookup.py b/tests/test_transformer/test_cross_table_lookup.py new file mode 100644 index 0000000..f2eabb1 --- /dev/null +++ b/tests/test_transformer/test_cross_table_lookup.py @@ -0,0 +1,347 @@ +"""Integration tests for cross-table join lookups (Issue #134). + +These tests exercise the full stack: DataLoader → LookupIndex → Bindings → +ObjectTransformer → engine.transform_spec. Temporary TSV files serve as +primary and secondary tables. +""" + +# ruff: noqa: ANN401, PLR2004 + +import textwrap + +import pytest +import yaml +from linkml_runtime import SchemaView + +from linkml_map.loaders.data_loaders import DataLoader +from linkml_map.transformer.engine import transform_spec +from linkml_map.transformer.object_transformer import ObjectTransformer + + +# ---- fixtures ---- + +SOURCE_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/cross-table-source + name: cross_table_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + lab_results: + attributes: + sample_id: + identifier: true + participant_id: {} + analyte: {} + result_value: {} + demographics: + attributes: + participant_id: + identifier: true + age_at_exam: {} + sex: {} + site_info: + attributes: + site_code: + identifier: true + site_name: {} +""") + +TARGET_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/cross-table-target + name: cross_table_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + MeasurementObservation: + attributes: + sample_id: + identifier: true + analyte_value: {} + age_at_observation: {} + participant_sex: {} + site_name: {} +""") + + +@pytest.fixture() +def data_dir(tmp_path): + """Write TSV data files and return the directory path.""" + lab = tmp_path / "lab_results.tsv" + lab.write_text( + "sample_id\tparticipant_id\tanalyte\tresult_value\n" + "S001\tP001\tglucose\t5.5\n" + "S002\tP002\tcholesterol\t200\n" + "S003\tP999\tglucose\t6.1\n" # P999 has no demographics row + ) + demo = tmp_path / "demographics.tsv" + demo.write_text( + "participant_id\tage_at_exam\tsex\n" + "P001\t30\tF\n" + "P002\t45\tM\n" + ) + site = tmp_path / "site_info.tsv" + site.write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + return tmp_path + + +@pytest.fixture() +def source_sv(): + return SchemaView(SOURCE_SCHEMA_YAML) + + +@pytest.fixture() +def target_sv(): + return SchemaView(TARGET_SCHEMA_YAML) + + +def _make_transformer(source_sv, target_sv, spec_yaml): + """Build an ObjectTransformer from inline YAML strings.""" + tr = ObjectTransformer(unrestricted_eval=False) + tr.source_schemaview = source_sv + tr.target_schemaview = target_sv + tr.create_transformer_specification(yaml.safe_load(spec_yaml)) + return tr + + +# ---- tests ---- + + +def test_cross_table_on_shorthand(data_dir, source_sv, target_sv): + """Cross-table lookup using the `on` shorthand (same column name in both tables).""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + analyte_value: + populated_from: result_value + age_at_observation: + expr: "{demographics.age_at_exam}" + participant_sex: + expr: "{demographics.sex}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + assert len(results) == 3 + + # S001 → P001 → age 30, sex F + r0 = results[0] + assert r0["sample_id"] == "S001" + assert str(r0["analyte_value"]) == "5.5" + assert r0["age_at_observation"] == "30" + assert r0["participant_sex"] == "F" + + # S002 → P002 → age 45, sex M + r1 = results[1] + assert r1["sample_id"] == "S002" + assert r1["age_at_observation"] == "45" + assert r1["participant_sex"] == "M" + + +def test_cross_table_explicit_keys(data_dir, source_sv, target_sv): + """Cross-table lookup with explicit source_key and lookup_key.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + source_key: participant_id + lookup_key: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + assert results[0]["age_at_observation"] == "30" + assert results[1]["age_at_observation"] == "45" + + +def test_null_propagation_no_match(data_dir, source_sv, target_sv): + """When the lookup table has no matching row, {table.col} propagates None.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + # S003 → P999 → no demographics row → None + r2 = results[2] + assert r2["sample_id"] == "S003" + assert r2.get("age_at_observation") is None + + +def test_expression_with_joined_column(data_dir, source_sv, target_sv): + """Expressions can combine joined columns with arithmetic.""" + # Override target schema to use integer range + target_yaml = textwrap.dedent("""\ + id: https://example.org/cross-table-target + name: cross_table_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + MeasurementObservation: + attributes: + sample_id: + identifier: true + age_at_observation: + range: integer + """) + t_sv = SchemaView(target_yaml) + + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "int({demographics.age_at_exam}) * 365" + """) + tr = _make_transformer(source_sv, t_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + assert results[0]["age_at_observation"] == 30 * 365 + assert results[1]["age_at_observation"] == 45 * 365 + # P999 → null propagation through int() would raise, but {..} catches it first + assert results[2].get("age_at_observation") is None + + +def test_multiple_joined_tables(data_dir, source_sv, target_sv, tmp_path): + """Multiple secondary tables can be joined in a single class_derivation.""" + # Add a site_code column to lab_results + lab = tmp_path / "lab_results.tsv" + lab.write_text( + "sample_id\tparticipant_id\tanalyte\tresult_value\tsite_code\n" + "S001\tP001\tglucose\t5.5\tSITE_A\n" + ) + # Copy demographics and site_info to tmp_path (already in data_dir fixture) + (tmp_path / "demographics.tsv").write_text( + "participant_id\tage_at_exam\tsex\n" + "P001\t30\tF\n" + ) + (tmp_path / "site_info.tsv").write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + + # Extend source schema to include site_code on lab_results + src_yaml = textwrap.dedent("""\ + id: https://example.org/cross-table-source + name: cross_table_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + lab_results: + attributes: + sample_id: + identifier: true + participant_id: {} + analyte: {} + result_value: {} + site_code: {} + demographics: + attributes: + participant_id: + identifier: true + age_at_exam: {} + sex: {} + site_info: + attributes: + site_code: + identifier: true + site_name: {} + """) + s_sv = SchemaView(src_yaml) + + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + site_info: + source_key: site_code + lookup_key: site_code + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + participant_sex: + expr: "{demographics.sex}" + site_name: + expr: "{site_info.site_name}" + """) + tr = _make_transformer(s_sv, target_sv, spec) + loader = DataLoader(tmp_path) + results = list(transform_spec(tr, loader)) + + assert len(results) == 1 + assert results[0]["age_at_observation"] == "30" + assert results[0]["participant_sex"] == "F" + assert results[0]["site_name"] == "Boston Medical" + + +def test_join_spec_missing_key_raises(source_sv, target_sv, data_dir): + """A join spec with neither `on` nor source_key/lookup_key raises ValueError.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: {} + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + with pytest.raises(ValueError, match="must specify"): + list(transform_spec(tr, loader)) diff --git a/tests/test_utils/test_eval_utils.py b/tests/test_utils/test_eval_utils.py index cf69e95..4df1ad0 100644 --- a/tests/test_utils/test_eval_utils.py +++ b/tests/test_utils/test_eval_utils.py @@ -206,6 +206,38 @@ def test_list_variable_concatenation() -> None: assert eval_expr("{x} + {y}", x=["a", "b"], y=["c", "d"]) == ["a", "b", "c", "d"] +# ---- Curly-braced attribute access (cross-table syntax) ---- + + +def test_curly_attribute_access() -> None: + """{obj.attr} resolves attribute access with null propagation.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(age=30, name="Alice") + assert eval_expr("{demo.age} * 365", demo=demo) == 30 * 365 + + +def test_curly_attribute_null_propagation_none_obj() -> None: + """{obj.attr} propagates None when the object itself is None.""" + assert eval_expr("{demo.age} * 365", demo=None) is None + + +def test_curly_attribute_null_propagation_missing_attr() -> None: + """{obj.attr} propagates None when the attribute is missing.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(name="Alice") # no 'age' attribute + assert eval_expr("{demo.age} * 365", demo=demo) is None + + +def test_curly_attribute_in_string_concat() -> None: + """{obj.attr} works in string concatenation expressions.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(prefix="Dr") + assert eval_expr("{demo.prefix} + '. Smith'", demo=demo) == "Dr. Smith" + + # ---- Functions ---- diff --git a/tests/test_utils/test_lookup_index.py b/tests/test_utils/test_lookup_index.py new file mode 100644 index 0000000..31d24de --- /dev/null +++ b/tests/test_utils/test_lookup_index.py @@ -0,0 +1,84 @@ +"""Tests for the DuckDB-backed LookupIndex.""" + +# ruff: noqa: ANN401 + +import pytest + +from linkml_map.utils.lookup_index import LookupIndex + + +@pytest.fixture() +def tmp_tsv(tmp_path): + """Create a simple TSV file and return its path.""" + tsv = tmp_path / "demo.tsv" + tsv.write_text("id\tname\tage\nP001\tAlice\t30\nP002\tBob\t25\n") + return tsv + + +@pytest.fixture() +def index(): + """Create a LookupIndex and close it after the test.""" + idx = LookupIndex() + yield idx + idx.close() + + +def test_register_and_lookup(index, tmp_tsv): + """Register a table and look up a row by key.""" + index.register_table("demo", tmp_tsv, "id") + row = index.lookup_row("demo", "id", "P001") + assert row is not None + assert row["name"] == "Alice" + assert row["age"] == "30" + + +def test_lookup_missing_row(index, tmp_tsv): + """Looking up a nonexistent key returns None.""" + index.register_table("demo", tmp_tsv, "id") + assert index.lookup_row("demo", "id", "MISSING") is None + + +def test_is_registered(index, tmp_tsv): + """is_registered reflects table state.""" + assert not index.is_registered("demo") + index.register_table("demo", tmp_tsv, "id") + assert index.is_registered("demo") + + +def test_drop(index, tmp_tsv): + """Dropping a table removes it from the index.""" + index.register_table("demo", tmp_tsv, "id") + index.drop("demo") + assert not index.is_registered("demo") + + +def test_drop_nonexistent(index): + """Dropping a table that was never registered does not raise.""" + index.drop("nonexistent") + + +def test_csv_format(index, tmp_path): + """CSV files are also handled by read_csv_auto.""" + csv = tmp_path / "data.csv" + csv.write_text("id,value\nX1,100\nX2,200\n") + index.register_table("data", csv, "id") + row = index.lookup_row("data", "id", "X2") + assert row is not None + assert row["value"] == "200" + + +def test_invalid_identifier(index): + """SQL-injection-style identifiers are rejected.""" + with pytest.raises(ValueError, match="Invalid identifier"): + index.register_table("drop table;--", "/dev/null", "id") + + +def test_all_varchar_coercion(index, tmp_path): + """Numeric-looking values are stored as VARCHAR due to all_varchar=true.""" + tsv = tmp_path / "nums.tsv" + tsv.write_text("id\tcount\n1\t42\n2\t99\n") + index.register_table("nums", tsv, "id") + row = index.lookup_row("nums", "id", "1") + assert row is not None + assert row["count"] == "42" + assert isinstance(row["count"], str) From f7223a5ff92ddf02da65ef88979660911c060280 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Wed, 4 Mar 2026 08:12:18 -0600 Subject: [PATCH 2/6] Fix Python 3.9 compatibility: use Optional[] instead of X | None The `DynObj | None` union syntax requires Python 3.10+. CI tests against Python 3.9, so use `Optional[DynObj]` instead. Co-Authored-By: Claude Opus 4.6 --- src/linkml_map/transformer/object_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linkml_map/transformer/object_transformer.py b/src/linkml_map/transformer/object_transformer.py index c990570..a38ba61 100644 --- a/src/linkml_map/transformer/object_transformer.py +++ b/src/linkml_map/transformer/object_transformer.py @@ -115,7 +115,7 @@ def __getitem__(self, name: Any) -> Any: return self.bindings.get(name) - def _resolve_join(self, table_name: str) -> DynObj | None: + def _resolve_join(self, table_name: str) -> Optional[DynObj]: """Resolve a cross-table lookup, returning a DynObj or None.""" spec = self.join_specs[table_name] source_key = spec.source_key or spec.join_on From 48fb889a846882dc134902593745e5b3da08abd3 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Wed, 4 Mar 2026 08:14:56 -0600 Subject: [PATCH 3/6] Align transformer_model.py with gen-pydantic output The CI regenerates the Pydantic model from YAML and commits any diff. Align the hand-edited file with gen-pydantic output to avoid spurious CI push failures (only difference was comment line wrapping). Co-Authored-By: Claude Opus 4.6 --- src/linkml_map/datamodel/transformer_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/linkml_map/datamodel/transformer_model.py b/src/linkml_map/datamodel/transformer_model.py index 0f20561..d0eb912 100644 --- a/src/linkml_map/datamodel/transformer_model.py +++ b/src/linkml_map/datamodel/transformer_model.py @@ -263,7 +263,8 @@ class ClassDerivation(ElementDerivation): 'EnumDerivation', 'PermissibleValueDerivation']} }) joins: Optional[Dict[str, AliasedClass]] = Field(default_factory=dict, description="""Additional classes to be joined to derive instances of the target class""", json_schema_extra = { "linkml_meta": {'alias': 'joins', - 'comments': ['supports cross-table lookups via source_key/lookup_key or on shorthand'], + 'comments': ['supports cross-table lookups via source_key/lookup_key or on ' + 'shorthand'], 'domain_of': ['ClassDerivation']} }) slot_derivations: Optional[Dict[str, SlotDerivation]] = Field(default_factory=dict, json_schema_extra = { "linkml_meta": {'alias': 'slot_derivations', 'domain_of': ['TransformationSpecification', 'ClassDerivation']} }) From 18a34062f6072ea87b76d9645014808e87f73fe5 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Wed, 4 Mar 2026 08:16:56 -0600 Subject: [PATCH 4/6] Fix Python 3.9 compat: remove zip(strict=True) The strict parameter for zip() was added in Python 3.10. Co-Authored-By: Claude Opus 4.6 --- src/linkml_map/utils/lookup_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linkml_map/utils/lookup_index.py b/src/linkml_map/utils/lookup_index.py index c379a2d..51dd9aa 100644 --- a/src/linkml_map/utils/lookup_index.py +++ b/src/linkml_map/utils/lookup_index.py @@ -71,7 +71,7 @@ def lookup_row( if result is None: return None columns = [desc[0] for desc in self._conn.description] - return dict(zip(columns, result, strict=True)) + return dict(zip(columns, result)) def drop(self, table: str) -> None: """Drop a registered table, releasing memory.""" From d6f50c5ba37a7fcd2f1be2873ff606316608ee44 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:36:28 -0600 Subject: [PATCH 5/6] Address PR review feedback - Fix 'on shorthand' references to 'join_on' in comments and error messages - Use parameter binding for file_path in DuckDB read_csv_auto call - Add # noqa: S608 to validated dynamic SQL statements - Raise clear ValueError when join is configured but lookup_index is None - Validate both source_key and lookup_key in engine join registration - Resolve path in data_loader.get_path() to match docstring guarantee Co-Authored-By: Claude Opus 4.6 --- src/linkml_map/datamodel/transformer_model.py | 4 ++-- src/linkml_map/datamodel/transformer_model.yaml | 2 +- src/linkml_map/loaders/data_loaders.py | 2 +- src/linkml_map/transformer/engine.py | 7 ++++--- src/linkml_map/transformer/object_transformer.py | 7 +++++-- src/linkml_map/utils/lookup_index.py | 7 ++++--- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/linkml_map/datamodel/transformer_model.py b/src/linkml_map/datamodel/transformer_model.py index d0eb912..d5a761e 100644 --- a/src/linkml_map/datamodel/transformer_model.py +++ b/src/linkml_map/datamodel/transformer_model.py @@ -263,8 +263,8 @@ class ClassDerivation(ElementDerivation): 'EnumDerivation', 'PermissibleValueDerivation']} }) joins: Optional[Dict[str, AliasedClass]] = Field(default_factory=dict, description="""Additional classes to be joined to derive instances of the target class""", json_schema_extra = { "linkml_meta": {'alias': 'joins', - 'comments': ['supports cross-table lookups via source_key/lookup_key or on ' - 'shorthand'], + 'comments': ['supports cross-table lookups via source_key/lookup_key or the ' + 'join_on field'], 'domain_of': ['ClassDerivation']} }) slot_derivations: Optional[Dict[str, SlotDerivation]] = Field(default_factory=dict, json_schema_extra = { "linkml_meta": {'alias': 'slot_derivations', 'domain_of': ['TransformationSpecification', 'ClassDerivation']} }) diff --git a/src/linkml_map/datamodel/transformer_model.yaml b/src/linkml_map/datamodel/transformer_model.yaml index 3b3813f..2f71fd8 100644 --- a/src/linkml_map/datamodel/transformer_model.yaml +++ b/src/linkml_map/datamodel/transformer_model.yaml @@ -185,7 +185,7 @@ classes: inlined: true description: Additional classes to be joined to derive instances of the target class comments: - - supports cross-table lookups via source_key/lookup_key or on shorthand + - supports cross-table lookups via source_key/lookup_key or the join_on field slot_derivations: range: SlotDerivation multivalued: true diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index d4aa094..a939c10 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -248,7 +248,7 @@ def get_path(self, identifier: str) -> Path: if path is None: msg = f"No data file found for identifier {identifier!r} under {self.base_path}" raise FileNotFoundError(msg) - return path + return path.resolve() def __contains__(self, identifier: str) -> bool: """Check if a data file exists for the given identifier.""" diff --git a/src/linkml_map/transformer/engine.py b/src/linkml_map/transformer/engine.py index f748083..1759b59 100644 --- a/src/linkml_map/transformer/engine.py +++ b/src/linkml_map/transformer/engine.py @@ -56,10 +56,11 @@ def transform_spec( if class_deriv.joins: for join_name, join_spec in class_deriv.joins.items(): lookup_key = join_spec.lookup_key or join_spec.join_on - if not lookup_key: + source_key = join_spec.source_key or join_spec.join_on + if not lookup_key or not source_key: msg = ( - f"Join {join_name!r} must specify 'join_on' or " - f"'lookup_key'" + f"Join {join_name!r} must specify 'join_on' or both " + f"'source_key' and 'lookup_key'" ) raise ValueError(msg) join_path = data_loader.get_path(join_name) diff --git a/src/linkml_map/transformer/object_transformer.py b/src/linkml_map/transformer/object_transformer.py index a38ba61..19bcd8a 100644 --- a/src/linkml_map/transformer/object_transformer.py +++ b/src/linkml_map/transformer/object_transformer.py @@ -108,7 +108,10 @@ def __iter__(self) -> Iterator: def __getitem__(self, name: Any) -> Any: if name not in self.bindings: - if name in self.join_specs and self.object_transformer.lookup_index is not None: + if name in self.join_specs: + if self.object_transformer.lookup_index is None: + msg = f"Join configured for {name!r} but lookup_index has not been initialized" + raise ValueError(msg) self.bindings[name] = self._resolve_join(name) else: _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) @@ -121,7 +124,7 @@ def _resolve_join(self, table_name: str) -> Optional[DynObj]: source_key = spec.source_key or spec.join_on lookup_key = spec.lookup_key or spec.join_on if not source_key or not lookup_key: - msg = f"Join spec for {table_name!r} must specify 'on' or both 'source_key' and 'lookup_key'" + msg = f"Join spec for {table_name!r} must specify 'join_on' or both 'source_key' and 'lookup_key'" raise ValueError(msg) key_val = self.source_obj.get(source_key) if key_val is None: diff --git a/src/linkml_map/utils/lookup_index.py b/src/linkml_map/utils/lookup_index.py index 51dd9aa..dd2889b 100644 --- a/src/linkml_map/utils/lookup_index.py +++ b/src/linkml_map/utils/lookup_index.py @@ -44,10 +44,11 @@ def register_table(self, name: str, file_path: Path | str, key_column: str) -> N file_path = Path(file_path) self._conn.execute( f"CREATE OR REPLACE TABLE {name} AS " # noqa: S608 - f"SELECT * FROM read_csv_auto('{file_path}', all_varchar=true)" + "SELECT * FROM read_csv_auto(?, all_varchar=true)", + [str(file_path)] ) self._conn.execute( - f"CREATE INDEX IF NOT EXISTS idx_{name}_{key_column} ON {name} ({key_column})" + f"CREATE INDEX IF NOT EXISTS idx_{name}_{key_column} ON {name} ({key_column})" # noqa: S608 ) self._tables[name] = key_column @@ -76,7 +77,7 @@ def lookup_row( def drop(self, table: str) -> None: """Drop a registered table, releasing memory.""" _validate_identifier(table) - self._conn.execute(f"DROP TABLE IF EXISTS {table}") + self._conn.execute(f"DROP TABLE IF EXISTS {table}") # noqa: S608 self._tables.pop(table, None) def is_registered(self, table: str) -> bool: From 685b1889eda835646cba06b5879a5c611f218171 Mon Sep 17 00:00:00 2001 From: Mark Andrew Miller Date: Mon, 9 Mar 2026 14:29:11 -0400 Subject: [PATCH 6/6] Add edge-case tests for cross-table lookup --- .../test_engine_edge_cases.py | 245 ++++++++++++++++++ .../test_lookup_index_edge_cases.py | 121 +++++++++ 2 files changed, 366 insertions(+) create mode 100644 tests/test_transformer/test_engine_edge_cases.py create mode 100644 tests/test_utils/test_lookup_index_edge_cases.py diff --git a/tests/test_transformer/test_engine_edge_cases.py b/tests/test_transformer/test_engine_edge_cases.py new file mode 100644 index 0000000..265a79e --- /dev/null +++ b/tests/test_transformer/test_engine_edge_cases.py @@ -0,0 +1,245 @@ +"""Edge-case tests for the transform_spec engine (supplements test_cross_table_lookup.py). + +Covers: +- Engine with no-joins class_derivation (regression safety) +- Empty joined table (headers only) +- Mixed derivations: one with joins, one without + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +# ruff: noqa: ANN401, PLR2004 + +import textwrap + +import yaml +from linkml_runtime import SchemaView + +from linkml_map.loaders.data_loaders import DataLoader +from linkml_map.transformer.engine import transform_spec +from linkml_map.transformer.object_transformer import ObjectTransformer + + +# ---- shared schemas ---- + +SOURCE_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-source + name: engine_test_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + samples: + attributes: + sample_id: + identifier: true + name: {} + site_code: {} + sites: + attributes: + site_code: + identifier: true + site_name: {} +""") + +TARGET_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} +""") + + +def _make_transformer(source_sv, target_sv, spec_yaml): + """Build an ObjectTransformer from inline YAML strings.""" + tr = ObjectTransformer(unrestricted_eval=False) + tr.source_schemaview = source_sv + tr.target_schemaview = target_sv + tr.create_transformer_specification(yaml.safe_load(spec_yaml)) + return tr + + +# ---- no-joins regression ---- + + +def test_engine_no_joins(tmp_path): + """transform_spec works for a class_derivation with no joins block. + + This is a regression test ensuring the join machinery doesn't break + the common case where joins are not used. + """ + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + "S002\tBeta\tSITE_B\n" + ) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 2 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + assert results[1]["sample_id"] == "S002" + assert results[1]["name"] == "Beta" + + +def test_engine_no_joins_no_data(tmp_path): + """transform_spec gracefully yields nothing when the data file doesn't exist.""" + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) # no files in tmp_path + + results = list(transform_spec(tr, loader)) + assert results == [] + + +# ---- empty joined table ---- + + +def test_join_with_empty_secondary_table(tmp_path): + """When a joined table has headers but no data rows, lookups return None.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + # sites.tsv has headers only — no data rows + (tmp_path / "sites.tsv").write_text("site_code\tsite_name\n") + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 1 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + # No matching row in empty sites table → None via null propagation + assert results[0].get("site_name") is None + + +# ---- mixed: one derivation with joins, one without ---- + + +def test_mixed_derivations_with_and_without_joins(tmp_path): + """Multiple class_derivations can coexist: some with joins, some without.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + (tmp_path / "sites.tsv").write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + + # Two target classes: one uses joins, one doesn't + target_yaml = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} + SimpleSample: + attributes: + sample_id: + identifier: true + name: {} + """) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + SimpleSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(target_yaml) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + # Should get results from both derivations + assert len(results) == 2 + # First: FlatSample with join + assert results[0]["site_name"] == "Boston Medical" + # Second: SimpleSample without join + assert results[1]["sample_id"] == "S001" + assert results[1]["name"] == "Alpha" diff --git a/tests/test_utils/test_lookup_index_edge_cases.py b/tests/test_utils/test_lookup_index_edge_cases.py new file mode 100644 index 0000000..40ae4a7 --- /dev/null +++ b/tests/test_utils/test_lookup_index_edge_cases.py @@ -0,0 +1,121 @@ +"""Edge-case tests for LookupIndex (supplements test_lookup_index.py). + +Covers: +- Duplicate key behavior (LIMIT 1 first-match semantics) +- Empty tables (headers only, zero data rows) +- Lifecycle after close() (operations should fail gracefully) + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +import duckdb +import pytest + +from linkml_map.utils.lookup_index import LookupIndex + + +@pytest.fixture() +def index(): + """Create a LookupIndex and close it after the test.""" + idx = LookupIndex() + yield idx + idx.close() + + +# ---- Duplicate key behavior ---- + + +def test_duplicate_keys_returns_a_row(index, tmp_path): + """When multiple rows share the same key, lookup_row returns one of them. + + The current implementation uses ``LIMIT 1`` without an ``ORDER BY``, + so the returned row is deterministic per DuckDB's storage order (insertion + order for ``read_csv_auto``) but this is NOT guaranteed by the API. + This test documents the behavior without asserting which duplicate wins. + """ + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "participant_id\tname\tage\n" + "P001\tAlice\t30\n" + "P001\tAlice-v2\t31\n" + "P002\tBob\t25\n" + ) + index.register_table("dupes", tsv, "participant_id") + row = index.lookup_row("dupes", "participant_id", "P001") + + # A row IS returned (not None) + assert row is not None + assert row["participant_id"] == "P001" + # The name is one of the two duplicate rows + assert row["name"] in {"Alice", "Alice-v2"} + + +def test_duplicate_keys_unique_rows_unaffected(index, tmp_path): + """Rows with unique keys are unaffected by the presence of duplicates elsewhere.""" + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "id\tvalue\n" + "A\t1\n" + "A\t2\n" + "B\t3\n" + ) + index.register_table("dupes", tsv, "id") + row = index.lookup_row("dupes", "id", "B") + assert row is not None + assert row["value"] == "3" + + +# ---- Empty tables ---- + + +def test_empty_table_headers_only(index, tmp_path): + """A table with column headers but zero data rows can be registered and queried.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tname\tage\n") + index.register_table("empty", tsv, "id") + + assert index.is_registered("empty") + assert index.lookup_row("empty", "id", "anything") is None + + +def test_empty_table_then_drop(index, tmp_path): + """An empty table can be dropped without error.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tvalue\n") + index.register_table("empty", tsv, "id") + index.drop("empty") + assert not index.is_registered("empty") + + +# ---- Lifecycle after close() ---- + + +def test_close_clears_tables(index, tmp_path): + """After close(), is_registered returns False for all tables.""" + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + index.register_table("data", tsv, "id") + assert index.is_registered("data") + + index.close() + assert not index.is_registered("data") + + +def test_operations_after_close_raise(tmp_path): + """Register and lookup operations after close() raise an error.""" + idx = LookupIndex() + idx.close() + + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + + with pytest.raises((duckdb.ConnectionException, duckdb.InvalidInputException)): + idx.register_table("data", tsv, "id") + + +def test_double_close_is_safe(): + """Calling close() twice does not raise.""" + idx = LookupIndex() + idx.close() + # Second close should not raise + idx.close()