diff --git a/src/linkml_map/datamodel/transformer_model.py b/src/linkml_map/datamodel/transformer_model.py index 9c50f72..d5a761e 100644 --- a/src/linkml_map/datamodel/transformer_model.py +++ b/src/linkml_map/datamodel/transformer_model.py @@ -263,7 +263,8 @@ class ClassDerivation(ElementDerivation): 'EnumDerivation', 'PermissibleValueDerivation']} }) joins: Optional[Dict[str, AliasedClass]] = Field(default_factory=dict, description="""Additional classes to be joined to derive instances of the target class""", json_schema_extra = { "linkml_meta": {'alias': 'joins', - 'comments': ['not yet implemented'], + 'comments': ['supports cross-table lookups via source_key/lookup_key or the ' + 'join_on field'], 'domain_of': ['ClassDerivation']} }) slot_derivations: Optional[Dict[str, SlotDerivation]] = Field(default_factory=dict, json_schema_extra = { "linkml_meta": {'alias': 'slot_derivations', 'domain_of': ['TransformationSpecification', 'ClassDerivation']} }) @@ -341,6 +342,9 @@ class AliasedClass(ConfiguredBaseModel): alias: str = Field(default=..., description="""name of the class to be aliased""", json_schema_extra = { "linkml_meta": {'alias': 'alias', 'domain_of': ['AliasedClass']} }) class_named: Optional[str] = Field(default=None, description="""local alias for the class""", json_schema_extra = { "linkml_meta": {'alias': 'class_named', 'domain_of': ['AliasedClass']} }) + source_key: Optional[str] = Field(default=None, description="""column in the primary (populated_from) table used as the join key""", json_schema_extra = { "linkml_meta": {'alias': 'source_key', 'domain_of': ['AliasedClass']} }) + lookup_key: Optional[str] = Field(default=None, description="""column in the secondary (joined) table used as the join key""", json_schema_extra = { "linkml_meta": {'alias': 'lookup_key', 'domain_of': ['AliasedClass']} }) + join_on: Optional[str] = Field(default=None, description="""shorthand for source_key and lookup_key when both share the same column name""", 
json_schema_extra = { "linkml_meta": {'alias': 'join_on', 'domain_of': ['AliasedClass']} }) class SlotDerivation(ElementDerivation): diff --git a/src/linkml_map/datamodel/transformer_model.yaml b/src/linkml_map/datamodel/transformer_model.yaml index ab51e3f..2f71fd8 100644 --- a/src/linkml_map/datamodel/transformer_model.yaml +++ b/src/linkml_map/datamodel/transformer_model.yaml @@ -185,7 +185,7 @@ classes: inlined: true description: Additional classes to be joined to derive instances of the target class comments: - - not yet implemented + - supports cross-table lookups via source_key/lookup_key or the join_on field slot_derivations: range: SlotDerivation multivalued: true @@ -220,6 +220,12 @@ classes: description: name of the class to be aliased class_named: description: local alias for the class + source_key: + description: column in the primary (populated_from) table used as the join key + lookup_key: + description: column in the secondary (joined) table used as the join key + join_on: + description: shorthand for source_key and lookup_key when both share the same column name SlotDerivation: is_a: ElementDerivation diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index 9c956fd..a939c10 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -236,6 +236,20 @@ def _find_file(self, identifier: str) -> Optional[Path]: return None + def get_path(self, identifier: str) -> Path: + """ + Return the resolved file path for *identifier*. + + :param identifier: Logical table/file name (without extension). + :returns: Absolute path to the matching data file. + :raises FileNotFoundError: If no matching file is found. 
def transform_spec(
    transformer: ObjectTransformer,
    data_loader: DataLoader,
    source_type: str | None = None,
) -> Iterator[dict[str, Any]]:
    """
    Iterate class_derivation blocks and stream transformed rows.

    For each block whose ``populated_from`` (or, failing that, ``name``) is
    contained in *data_loader*, this generator:

    1. Validates every entry in ``joins`` before any file is loaded.
    2. Registers each join as a secondary table in
       ``transformer.lookup_index``, creating the index on first use.
    3. Streams primary-table rows through ``transformer.map_object``.
    4. Drops the secondary tables it registered, even if iteration fails
       or the generator is closed early.

    A lookup index is only created when a derivation actually declares
    joins, so plain (join-free) specifications never touch DuckDB.

    :param transformer: A configured :class:`ObjectTransformer`.
    :param data_loader: Loader supporting ``in``, ``[]`` and ``get_path``.
    :param source_type: Optional explicit source type override; defaults to
        the primary table name of each derivation.
    :returns: Iterator of transformed row dicts.
    :raises ValueError: If a join defines neither ``join_on`` nor both
        ``source_key`` and ``lookup_key``.
    """
    spec = transformer.derived_specification
    if spec is None:
        return

    derivations = spec.class_derivations or []
    if isinstance(derivations, dict):
        # Tolerate a name -> derivation mapping as well as a plain sequence.
        derivations = derivations.values()

    for class_deriv in derivations:
        table_name = class_deriv.populated_from or class_deriv.name
        if table_name not in data_loader:
            logger.debug("Skipping class_derivation %s: no data found", class_deriv.name)
            continue

        # Resolve and validate all join keys up front so that a malformed
        # spec fails before any secondary file has been read.
        lookup_keys: dict[str, str] = {}
        for join_name, join_spec in (class_deriv.joins or {}).items():
            lookup_key = join_spec.lookup_key or join_spec.join_on
            source_key = join_spec.source_key or join_spec.join_on
            if not lookup_key or not source_key:
                msg = (
                    f"Join {join_name!r} must specify 'join_on' or both "
                    f"'source_key' and 'lookup_key'"
                )
                raise ValueError(msg)
            lookup_keys[join_name] = lookup_key

        if lookup_keys and transformer.lookup_index is None:
            # Imported here so join-free transforms do not require duckdb.
            from linkml_map.utils.lookup_index import LookupIndex

            transformer.lookup_index = LookupIndex()

        joined_tables: list[str] = []
        try:
            # Register secondary (joined) tables
            for join_name, lookup_key in lookup_keys.items():
                join_path = data_loader.get_path(join_name)
                transformer.lookup_index.register_table(join_name, join_path, lookup_key)
                # Recorded only after success: only loaded tables get dropped.
                joined_tables.append(join_name)

            # Stream primary table rows
            for row in data_loader[table_name]:
                yield transformer.map_object(
                    row,
                    source_type=source_type or table_name,
                    class_derivation=class_deriv,
                )
        finally:
            for jt in joined_tables:
                transformer.lookup_index.drop(jt)
PLR0913 self.source_type: str = source_type self.sv: SchemaView = sv self.bindings: dict = {} + self.join_specs: dict[str, AliasedClass] = join_specs or {} if bindings: self.bindings.update(bindings) @@ -105,10 +108,32 @@ def __iter__(self) -> Iterator: def __getitem__(self, name: Any) -> Any: if name not in self.bindings: - _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) + if name in self.join_specs: + if self.object_transformer.lookup_index is None: + msg = f"Join configured for {name!r} but lookup_index has not been initialized" + raise ValueError(msg) + self.bindings[name] = self._resolve_join(name) + else: + _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) return self.bindings.get(name) + def _resolve_join(self, table_name: str) -> Optional[DynObj]: + """Resolve a cross-table lookup, returning a DynObj or None.""" + spec = self.join_specs[table_name] + source_key = spec.source_key or spec.join_on + lookup_key = spec.lookup_key or spec.join_on + if not source_key or not lookup_key: + msg = f"Join spec for {table_name!r} must specify 'join_on' or both 'source_key' and 'lookup_key'" + raise ValueError(msg) + key_val = self.source_obj.get(source_key) + if key_val is None: + return None + row = self.object_transformer.lookup_index.lookup_row(table_name, lookup_key, key_val) + if row is None: + return None + return DynObj(**row) + def __setitem__(self, name: Any, value: Any) -> None: del name, value msg = f"__setitem__ not allowed on class {self.__class__.__name__}" @@ -124,6 +149,7 @@ class ObjectTransformer(Transformer): """ object_index: ObjectIndex = None + lookup_index: Any = None # Optional[LookupIndex] — lazy import to avoid hard duckdb dep def index(self, source_obj: Any, target: Optional[str] = None) -> None: """ @@ -264,6 +290,7 @@ def map_object( source_type=source_type, sv=sv, bindings={"NULL": None}, + join_specs=class_deriv.joins if class_deriv.joins else None, ) try: diff --git a/src/linkml_map/utils/eval_utils.py 
_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")


def _validate_identifier(name: str) -> None:
    """
    Validate that *name* is a safe SQL identifier.

    ``fullmatch`` is used because ``$`` with ``match`` also accepts a
    trailing newline.

    :raises ValueError: If *name* is not ``[A-Za-z_][A-Za-z0-9_]*``.
    """
    if not _IDENTIFIER_RE.fullmatch(name):
        msg = f"Invalid identifier: {name!r}"
        raise ValueError(msg)


def _quote_identifier(name: str) -> str:
    """Validate *name* and return it double-quoted, so reserved words are usable."""
    _validate_identifier(name)
    return f'"{name}"'


class LookupIndex:
    """
    In-memory DuckDB index for cross-table lookups.

    Each registered table is loaded from a CSV/TSV file via ``read_csv_auto``
    with every column read as VARCHAR, and an index is created on a key
    column for single-row lookups. The object can be used as a context
    manager; leaving the ``with`` block closes the connection.
    """

    def __init__(self) -> None:
        """Initialize an empty lookup index with an in-memory DuckDB connection."""
        # Imported lazily so that importing this module does not need duckdb.
        import duckdb

        self._conn = duckdb.connect(":memory:")
        self._tables: dict[str, str] = {}  # table_name -> key_column

    def __enter__(self) -> LookupIndex:
        """Return the index itself for use in a ``with`` statement."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the connection when the ``with`` block exits."""
        self.close()

    def register_table(self, name: str, file_path: Path | str, key_column: str) -> None:
        """
        Load a CSV/TSV file into DuckDB and create an index on *key_column*.

        An existing table with the same name is replaced. If the index cannot
        be created, the freshly loaded table is dropped again before the
        error is re-raised, so no unregistered table is left behind.

        :param name: Logical table name (must be a valid identifier).
        :param file_path: Path to a CSV or TSV file.
        :param key_column: Column to index for lookups.
        :raises ValueError: If *name* or *key_column* is not a valid identifier.
        """
        table_sql = _quote_identifier(name)
        column_sql = _quote_identifier(key_column)
        index_sql = _quote_identifier(f"idx_{name}_{key_column}")
        self._conn.execute(
            f"CREATE OR REPLACE TABLE {table_sql} AS "  # noqa: S608
            "SELECT * FROM read_csv_auto(?, all_varchar=true)",
            [str(Path(file_path))],
        )
        try:
            self._conn.execute(
                f"CREATE INDEX IF NOT EXISTS {index_sql} ON {table_sql} ({column_sql})"  # noqa: S608
            )
        except Exception:  # noqa: BLE001 - cleanup only; the error is re-raised
            self._conn.execute(f"DROP TABLE IF EXISTS {table_sql}")  # noqa: S608
            self._tables.pop(name, None)
            raise
        self._tables[name] = key_column

    def lookup_row(
        self, table: str, key_col: str, key_val: Any  # noqa: ANN401
    ) -> dict[str, Any] | None:
        """
        Return the first row matching *key_val* on *key_col*, or ``None``.

        The key is compared as a string, matching the all-VARCHAR load. The
        query has ``LIMIT 1`` and no ``ORDER BY``, so with duplicate keys
        which row comes back is up to DuckDB.

        :param table: Previously registered table name.
        :param key_col: Column to match on.
        :param key_val: Value to look up; ``None`` never matches.
        :returns: Row as a dict, or None if not found.
        :raises ValueError: If *table* or *key_col* is not a valid identifier.
        """
        table_sql = _quote_identifier(table)
        column_sql = _quote_identifier(key_col)
        if key_val is None:
            # str(None) would otherwise match a literal 'None' cell.
            return None
        cursor = self._conn.execute(
            f"SELECT * FROM {table_sql} WHERE {column_sql} = $1 LIMIT 1",  # noqa: S608
            [str(key_val)],
        )
        row = cursor.fetchone()
        if row is None:
            return None
        columns = [desc[0] for desc in cursor.description]
        return dict(zip(columns, row))

    def drop(self, table: str) -> None:
        """Drop a registered table, releasing memory; unknown tables are ignored."""
        table_sql = _quote_identifier(table)
        self._conn.execute(f"DROP TABLE IF EXISTS {table_sql}")  # noqa: S608
        self._tables.pop(table, None)

    def is_registered(self, table: str) -> bool:
        """Check whether *table* has been registered."""
        return table in self._tables

    def close(self) -> None:
        """Close the DuckDB connection."""
        self._conn.close()
        self._tables.clear()
+ +These tests exercise the full stack: DataLoader → LookupIndex → Bindings → +ObjectTransformer → engine.transform_spec. Temporary TSV files serve as +primary and secondary tables. +""" + +# ruff: noqa: ANN401, PLR2004 + +import textwrap + +import pytest +import yaml +from linkml_runtime import SchemaView + +from linkml_map.loaders.data_loaders import DataLoader +from linkml_map.transformer.engine import transform_spec +from linkml_map.transformer.object_transformer import ObjectTransformer + + +# ---- fixtures ---- + +SOURCE_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/cross-table-source + name: cross_table_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + lab_results: + attributes: + sample_id: + identifier: true + participant_id: {} + analyte: {} + result_value: {} + demographics: + attributes: + participant_id: + identifier: true + age_at_exam: {} + sex: {} + site_info: + attributes: + site_code: + identifier: true + site_name: {} +""") + +TARGET_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/cross-table-target + name: cross_table_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + MeasurementObservation: + attributes: + sample_id: + identifier: true + analyte_value: {} + age_at_observation: {} + participant_sex: {} + site_name: {} +""") + + +@pytest.fixture() +def data_dir(tmp_path): + """Write TSV data files and return the directory path.""" + lab = tmp_path / "lab_results.tsv" + lab.write_text( + "sample_id\tparticipant_id\tanalyte\tresult_value\n" + "S001\tP001\tglucose\t5.5\n" + "S002\tP002\tcholesterol\t200\n" + "S003\tP999\tglucose\t6.1\n" # P999 has no demographics row + ) + demo = tmp_path / "demographics.tsv" + demo.write_text( + "participant_id\tage_at_exam\tsex\n" + "P001\t30\tF\n" + "P002\t45\tM\n" + ) + site = tmp_path / "site_info.tsv" + site.write_text( + "site_code\tsite_name\n" + 
"SITE_A\tBoston Medical\n" + ) + return tmp_path + + +@pytest.fixture() +def source_sv(): + return SchemaView(SOURCE_SCHEMA_YAML) + + +@pytest.fixture() +def target_sv(): + return SchemaView(TARGET_SCHEMA_YAML) + + +def _make_transformer(source_sv, target_sv, spec_yaml): + """Build an ObjectTransformer from inline YAML strings.""" + tr = ObjectTransformer(unrestricted_eval=False) + tr.source_schemaview = source_sv + tr.target_schemaview = target_sv + tr.create_transformer_specification(yaml.safe_load(spec_yaml)) + return tr + + +# ---- tests ---- + + +def test_cross_table_on_shorthand(data_dir, source_sv, target_sv): + """Cross-table lookup using the `on` shorthand (same column name in both tables).""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + analyte_value: + populated_from: result_value + age_at_observation: + expr: "{demographics.age_at_exam}" + participant_sex: + expr: "{demographics.sex}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + assert len(results) == 3 + + # S001 → P001 → age 30, sex F + r0 = results[0] + assert r0["sample_id"] == "S001" + assert str(r0["analyte_value"]) == "5.5" + assert r0["age_at_observation"] == "30" + assert r0["participant_sex"] == "F" + + # S002 → P002 → age 45, sex M + r1 = results[1] + assert r1["sample_id"] == "S002" + assert r1["age_at_observation"] == "45" + assert r1["participant_sex"] == "M" + + +def test_cross_table_explicit_keys(data_dir, source_sv, target_sv): + """Cross-table lookup with explicit source_key and lookup_key.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + source_key: participant_id + lookup_key: participant_id + slot_derivations: + sample_id: + 
populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + assert results[0]["age_at_observation"] == "30" + assert results[1]["age_at_observation"] == "45" + + +def test_null_propagation_no_match(data_dir, source_sv, target_sv): + """When the lookup table has no matching row, {table.col} propagates None.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + results = list(transform_spec(tr, loader)) + + # S003 → P999 → no demographics row → None + r2 = results[2] + assert r2["sample_id"] == "S003" + assert r2.get("age_at_observation") is None + + +def test_expression_with_joined_column(data_dir, source_sv, target_sv): + """Expressions can combine joined columns with arithmetic.""" + # Override target schema to use integer range + target_yaml = textwrap.dedent("""\ + id: https://example.org/cross-table-target + name: cross_table_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + MeasurementObservation: + attributes: + sample_id: + identifier: true + age_at_observation: + range: integer + """) + t_sv = SchemaView(target_yaml) + + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "int({demographics.age_at_exam}) * 365" + """) + tr = _make_transformer(source_sv, t_sv, spec) + loader = DataLoader(data_dir) + results = 
list(transform_spec(tr, loader)) + + assert results[0]["age_at_observation"] == 30 * 365 + assert results[1]["age_at_observation"] == 45 * 365 + # P999 → null propagation through int() would raise, but {..} catches it first + assert results[2].get("age_at_observation") is None + + +def test_multiple_joined_tables(data_dir, source_sv, target_sv, tmp_path): + """Multiple secondary tables can be joined in a single class_derivation.""" + # Add a site_code column to lab_results + lab = tmp_path / "lab_results.tsv" + lab.write_text( + "sample_id\tparticipant_id\tanalyte\tresult_value\tsite_code\n" + "S001\tP001\tglucose\t5.5\tSITE_A\n" + ) + # Copy demographics and site_info to tmp_path (already in data_dir fixture) + (tmp_path / "demographics.tsv").write_text( + "participant_id\tage_at_exam\tsex\n" + "P001\t30\tF\n" + ) + (tmp_path / "site_info.tsv").write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + + # Extend source schema to include site_code on lab_results + src_yaml = textwrap.dedent("""\ + id: https://example.org/cross-table-source + name: cross_table_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + lab_results: + attributes: + sample_id: + identifier: true + participant_id: {} + analyte: {} + result_value: {} + site_code: {} + demographics: + attributes: + participant_id: + identifier: true + age_at_exam: {} + sex: {} + site_info: + attributes: + site_code: + identifier: true + site_name: {} + """) + s_sv = SchemaView(src_yaml) + + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: + join_on: participant_id + site_info: + source_key: site_code + lookup_key: site_code + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + participant_sex: + expr: "{demographics.sex}" + site_name: + expr: "{site_info.site_name}" + """) + tr = 
_make_transformer(s_sv, target_sv, spec) + loader = DataLoader(tmp_path) + results = list(transform_spec(tr, loader)) + + assert len(results) == 1 + assert results[0]["age_at_observation"] == "30" + assert results[0]["participant_sex"] == "F" + assert results[0]["site_name"] == "Boston Medical" + + +def test_join_spec_missing_key_raises(source_sv, target_sv, data_dir): + """A join spec with neither `on` nor source_key/lookup_key raises ValueError.""" + spec = textwrap.dedent("""\ + class_derivations: + MeasurementObservation: + populated_from: lab_results + joins: + demographics: {} + slot_derivations: + sample_id: + populated_from: sample_id + age_at_observation: + expr: "{demographics.age_at_exam}" + """) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(data_dir) + with pytest.raises(ValueError, match="must specify"): + list(transform_spec(tr, loader)) diff --git a/tests/test_transformer/test_engine_edge_cases.py b/tests/test_transformer/test_engine_edge_cases.py new file mode 100644 index 0000000..265a79e --- /dev/null +++ b/tests/test_transformer/test_engine_edge_cases.py @@ -0,0 +1,245 @@ +"""Edge-case tests for the transform_spec engine (supplements test_cross_table_lookup.py). 
+ +Covers: +- Engine with no-joins class_derivation (regression safety) +- Empty joined table (headers only) +- Mixed derivations: one with joins, one without + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +# ruff: noqa: ANN401, PLR2004 + +import textwrap + +import yaml +from linkml_runtime import SchemaView + +from linkml_map.loaders.data_loaders import DataLoader +from linkml_map.transformer.engine import transform_spec +from linkml_map.transformer.object_transformer import ObjectTransformer + + +# ---- shared schemas ---- + +SOURCE_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-source + name: engine_test_source + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + samples: + attributes: + sample_id: + identifier: true + name: {} + site_code: {} + sites: + attributes: + site_code: + identifier: true + site_name: {} +""") + +TARGET_SCHEMA_YAML = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} +""") + + +def _make_transformer(source_sv, target_sv, spec_yaml): + """Build an ObjectTransformer from inline YAML strings.""" + tr = ObjectTransformer(unrestricted_eval=False) + tr.source_schemaview = source_sv + tr.target_schemaview = target_sv + tr.create_transformer_specification(yaml.safe_load(spec_yaml)) + return tr + + +# ---- no-joins regression ---- + + +def test_engine_no_joins(tmp_path): + """transform_spec works for a class_derivation with no joins block. + + This is a regression test ensuring the join machinery doesn't break + the common case where joins are not used. 
+ """ + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + "S002\tBeta\tSITE_B\n" + ) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 2 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + assert results[1]["sample_id"] == "S002" + assert results[1]["name"] == "Beta" + + +def test_engine_no_joins_no_data(tmp_path): + """transform_spec gracefully yields nothing when the data file doesn't exist.""" + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) # no files in tmp_path + + results = list(transform_spec(tr, loader)) + assert results == [] + + +# ---- empty joined table ---- + + +def test_join_with_empty_secondary_table(tmp_path): + """When a joined table has headers but no data rows, lookups return None.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + # sites.tsv has headers only — no data rows + (tmp_path / "sites.tsv").write_text("site_code\tsite_name\n") + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + """) + source_sv = 
SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(TARGET_SCHEMA_YAML) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + assert len(results) == 1 + assert results[0]["sample_id"] == "S001" + assert results[0]["name"] == "Alpha" + # No matching row in empty sites table → None via null propagation + assert results[0].get("site_name") is None + + +# ---- mixed: one derivation with joins, one without ---- + + +def test_mixed_derivations_with_and_without_joins(tmp_path): + """Multiple class_derivations can coexist: some with joins, some without.""" + (tmp_path / "samples.tsv").write_text( + "sample_id\tname\tsite_code\n" + "S001\tAlpha\tSITE_A\n" + ) + (tmp_path / "sites.tsv").write_text( + "site_code\tsite_name\n" + "SITE_A\tBoston Medical\n" + ) + + # Two target classes: one uses joins, one doesn't + target_yaml = textwrap.dedent("""\ + id: https://example.org/engine-test-target + name: engine_test_target + prefixes: + linkml: https://w3id.org/linkml/ + imports: + - linkml:types + default_range: string + classes: + FlatSample: + attributes: + sample_id: + identifier: true + name: {} + site_name: {} + SimpleSample: + attributes: + sample_id: + identifier: true + name: {} + """) + + spec = textwrap.dedent("""\ + class_derivations: + FlatSample: + populated_from: samples + joins: + sites: + join_on: site_code + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + site_name: + expr: "{sites.site_name}" + SimpleSample: + populated_from: samples + slot_derivations: + sample_id: + populated_from: sample_id + name: + populated_from: name + """) + source_sv = SchemaView(SOURCE_SCHEMA_YAML) + target_sv = SchemaView(target_yaml) + tr = _make_transformer(source_sv, target_sv, spec) + loader = DataLoader(tmp_path) + + results = list(transform_spec(tr, loader)) + + # Should get results from both derivations + assert len(results) == 2 + # First: 
FlatSample with join + assert results[0]["site_name"] == "Boston Medical" + # Second: SimpleSample without join + assert results[1]["sample_id"] == "S001" + assert results[1]["name"] == "Alpha" diff --git a/tests/test_utils/test_eval_utils.py b/tests/test_utils/test_eval_utils.py index b26dd91..f2cda19 100644 --- a/tests/test_utils/test_eval_utils.py +++ b/tests/test_utils/test_eval_utils.py @@ -206,6 +206,38 @@ def test_list_variable_concatenation() -> None: assert eval_expr("{x} + {y}", x=["a", "b"], y=["c", "d"]) == ["a", "b", "c", "d"] +# ---- Curly-braced attribute access (cross-table syntax) ---- + + +def test_curly_attribute_access() -> None: + """{obj.attr} resolves attribute access with null propagation.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(age=30, name="Alice") + assert eval_expr("{demo.age} * 365", demo=demo) == 30 * 365 + + +def test_curly_attribute_null_propagation_none_obj() -> None: + """{obj.attr} propagates None when the object itself is None.""" + assert eval_expr("{demo.age} * 365", demo=None) is None + + +def test_curly_attribute_null_propagation_missing_attr() -> None: + """{obj.attr} propagates None when the attribute is missing.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(name="Alice") # no 'age' attribute + assert eval_expr("{demo.age} * 365", demo=demo) is None + + +def test_curly_attribute_in_string_concat() -> None: + """{obj.attr} works in string concatenation expressions.""" + from linkml_map.utils.dynamic_object import DynObj + + demo = DynObj(prefix="Dr") + assert eval_expr("{demo.prefix} + '. Smith'", demo=demo) == "Dr. 
Smith" + + # ---- Functions ---- diff --git a/tests/test_utils/test_lookup_index.py b/tests/test_utils/test_lookup_index.py new file mode 100644 index 0000000..31d24de --- /dev/null +++ b/tests/test_utils/test_lookup_index.py @@ -0,0 +1,84 @@ +"""Tests for the DuckDB-backed LookupIndex.""" + +# ruff: noqa: ANN401 + +import pytest + +from linkml_map.utils.lookup_index import LookupIndex + + +@pytest.fixture() +def tmp_tsv(tmp_path): + """Create a simple TSV file and return its path.""" + tsv = tmp_path / "demo.tsv" + tsv.write_text("id\tname\tage\nP001\tAlice\t30\nP002\tBob\t25\n") + return tsv + + +@pytest.fixture() +def index(): + """Create a LookupIndex and close it after the test.""" + idx = LookupIndex() + yield idx + idx.close() + + +def test_register_and_lookup(index, tmp_tsv): + """Register a table and look up a row by key.""" + index.register_table("demo", tmp_tsv, "id") + row = index.lookup_row("demo", "id", "P001") + assert row is not None + assert row["name"] == "Alice" + assert row["age"] == "30" + + +def test_lookup_missing_row(index, tmp_tsv): + """Looking up a nonexistent key returns None.""" + index.register_table("demo", tmp_tsv, "id") + assert index.lookup_row("demo", "id", "MISSING") is None + + +def test_is_registered(index, tmp_tsv): + """is_registered reflects table state.""" + assert not index.is_registered("demo") + index.register_table("demo", tmp_tsv, "id") + assert index.is_registered("demo") + + +def test_drop(index, tmp_tsv): + """Dropping a table removes it from the index.""" + index.register_table("demo", tmp_tsv, "id") + index.drop("demo") + assert not index.is_registered("demo") + + +def test_drop_nonexistent(index): + """Dropping a table that was never registered does not raise.""" + index.drop("nonexistent") + + +def test_csv_format(index, tmp_path): + """CSV files are also handled by read_csv_auto.""" + csv = tmp_path / "data.csv" + csv.write_text("id,value\nX1,100\nX2,200\n") + index.register_table("data", csv, "id") + row = 
index.lookup_row("data", "id", "X2") + assert row is not None + assert row["value"] == "200" + + +def test_invalid_identifier(index): + """SQL-injection-style identifiers are rejected.""" + with pytest.raises(ValueError, match="Invalid identifier"): + index.register_table("drop table;--", "/dev/null", "id") + + +def test_all_varchar_coercion(index, tmp_path): + """Numeric-looking values are stored as VARCHAR due to all_varchar=true.""" + tsv = tmp_path / "nums.tsv" + tsv.write_text("id\tcount\n1\t42\n2\t99\n") + index.register_table("nums", tsv, "id") + row = index.lookup_row("nums", "id", "1") + assert row is not None + assert row["count"] == "42" + assert isinstance(row["count"], str) diff --git a/tests/test_utils/test_lookup_index_edge_cases.py b/tests/test_utils/test_lookup_index_edge_cases.py new file mode 100644 index 0000000..40ae4a7 --- /dev/null +++ b/tests/test_utils/test_lookup_index_edge_cases.py @@ -0,0 +1,121 @@ +"""Edge-case tests for LookupIndex (supplements test_lookup_index.py). + +Covers: +- Duplicate key behavior (LIMIT 1 first-match semantics) +- Empty tables (headers only, zero data rows) +- Lifecycle after close() (operations should fail gracefully) + +See: https://github.com/linkml/linkml-map/pull/136 +""" + +import duckdb +import pytest + +from linkml_map.utils.lookup_index import LookupIndex + + +@pytest.fixture() +def index(): + """Create a LookupIndex and close it after the test.""" + idx = LookupIndex() + yield idx + idx.close() + + +# ---- Duplicate key behavior ---- + + +def test_duplicate_keys_returns_a_row(index, tmp_path): + """When multiple rows share the same key, lookup_row returns one of them. + + The current implementation uses ``LIMIT 1`` without an ``ORDER BY``, + so the returned row is deterministic per DuckDB's storage order (insertion + order for ``read_csv_auto``) but this is NOT guaranteed by the API. + This test documents the behavior without asserting which duplicate wins. 
+ """ + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "participant_id\tname\tage\n" + "P001\tAlice\t30\n" + "P001\tAlice-v2\t31\n" + "P002\tBob\t25\n" + ) + index.register_table("dupes", tsv, "participant_id") + row = index.lookup_row("dupes", "participant_id", "P001") + + # A row IS returned (not None) + assert row is not None + assert row["participant_id"] == "P001" + # The name is one of the two duplicate rows + assert row["name"] in {"Alice", "Alice-v2"} + + +def test_duplicate_keys_unique_rows_unaffected(index, tmp_path): + """Rows with unique keys are unaffected by the presence of duplicates elsewhere.""" + tsv = tmp_path / "dupes.tsv" + tsv.write_text( + "id\tvalue\n" + "A\t1\n" + "A\t2\n" + "B\t3\n" + ) + index.register_table("dupes", tsv, "id") + row = index.lookup_row("dupes", "id", "B") + assert row is not None + assert row["value"] == "3" + + +# ---- Empty tables ---- + + +def test_empty_table_headers_only(index, tmp_path): + """A table with column headers but zero data rows can be registered and queried.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tname\tage\n") + index.register_table("empty", tsv, "id") + + assert index.is_registered("empty") + assert index.lookup_row("empty", "id", "anything") is None + + +def test_empty_table_then_drop(index, tmp_path): + """An empty table can be dropped without error.""" + tsv = tmp_path / "empty.tsv" + tsv.write_text("id\tvalue\n") + index.register_table("empty", tsv, "id") + index.drop("empty") + assert not index.is_registered("empty") + + +# ---- Lifecycle after close() ---- + + +def test_close_clears_tables(index, tmp_path): + """After close(), is_registered returns False for all tables.""" + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + index.register_table("data", tsv, "id") + assert index.is_registered("data") + + index.close() + assert not index.is_registered("data") + + +def test_operations_after_close_raise(tmp_path): + """Register operations after close() 
raise an error.""" + idx = LookupIndex() + idx.close() + + tsv = tmp_path / "data.tsv" + tsv.write_text("id\tval\nA\t1\n") + + with pytest.raises((duckdb.ConnectionException, duckdb.InvalidInputException)): + idx.register_table("data", tsv, "id") + + +def test_double_close_is_safe(): + """Calling close() twice does not raise.""" + idx = LookupIndex() + idx.close() + # Second close should not raise + idx.close()