Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 245 additions & 0 deletions tests/test_transformer/test_engine_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
"""Edge-case tests for the transform_spec engine (supplements test_cross_table_lookup.py).

Covers:
- Engine with no-joins class_derivation (regression safety)
- Empty joined table (headers only)
- Mixed derivations: one with joins, one without

See: https://github.com/linkml/linkml-map/pull/136
"""

# ruff: noqa: ANN401, PLR2004

import textwrap

import yaml
from linkml_runtime import SchemaView

from linkml_map.loaders.data_loaders import DataLoader
from linkml_map.transformer.engine import transform_spec
from linkml_map.transformer.object_transformer import ObjectTransformer


# ---- shared schemas ----

# Minimal source schema: a `samples` table keyed by sample_id and a
# `sites` table keyed by site_code (joinable via samples.site_code).
# NOTE: the YAML nesting below is required for SchemaView to parse the
# schema — classes/attributes must be properly indented.
SOURCE_SCHEMA_YAML = textwrap.dedent("""\
    id: https://example.org/engine-test-source
    name: engine_test_source
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      samples:
        attributes:
          sample_id:
            identifier: true
          name: {}
          site_code: {}
      sites:
        attributes:
          site_code:
            identifier: true
          site_name: {}
    """)

# Target schema: a single flattened class that combines sample fields
# with the looked-up site_name.
TARGET_SCHEMA_YAML = textwrap.dedent("""\
    id: https://example.org/engine-test-target
    name: engine_test_target
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      FlatSample:
        attributes:
          sample_id:
            identifier: true
          name: {}
          site_name: {}
    """)


def _make_transformer(source_sv, target_sv, spec_yaml):
    """Construct an ObjectTransformer wired to the given schema views.

    ``spec_yaml`` is a YAML string holding the transformation
    specification; it is parsed with ``yaml.safe_load`` before being
    handed to the transformer.
    """
    transformer = ObjectTransformer(unrestricted_eval=False)
    transformer.source_schemaview = source_sv
    transformer.target_schemaview = target_sv
    parsed_spec = yaml.safe_load(spec_yaml)
    transformer.create_transformer_specification(parsed_spec)
    return transformer


# ---- no-joins regression ----


def test_engine_no_joins(tmp_path):
    """transform_spec works for a class_derivation with no joins block.

    This is a regression test ensuring the join machinery doesn't break
    the common case where joins are not used.
    """
    (tmp_path / "samples.tsv").write_text(
        "sample_id\tname\tsite_code\n"
        "S001\tAlpha\tSITE_A\n"
        "S002\tBeta\tSITE_B\n"
    )

    # Plain derivation: no joins, straight column copies.
    spec = textwrap.dedent("""\
        class_derivations:
          FlatSample:
            populated_from: samples
            slot_derivations:
              sample_id:
                populated_from: sample_id
              name:
                populated_from: name
        """)
    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(TARGET_SCHEMA_YAML)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)

    results = list(transform_spec(tr, loader))

    # One output row per input row, in file order.
    assert len(results) == 2
    assert results[0]["sample_id"] == "S001"
    assert results[0]["name"] == "Alpha"
    assert results[1]["sample_id"] == "S002"
    assert results[1]["name"] == "Beta"


def test_engine_no_joins_no_data(tmp_path):
    """transform_spec gracefully yields nothing when the data file doesn't exist."""
    spec = textwrap.dedent("""\
        class_derivations:
          FlatSample:
            populated_from: samples
            slot_derivations:
              sample_id:
                populated_from: sample_id
        """)
    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(TARGET_SCHEMA_YAML)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)  # no files in tmp_path

    # Missing samples.tsv must not raise — it yields an empty stream.
    results = list(transform_spec(tr, loader))
    assert results == []


# ---- empty joined table ----


def test_join_with_empty_secondary_table(tmp_path):
    """When a joined table has headers but no data rows, lookups return None."""
    (tmp_path / "samples.tsv").write_text(
        "sample_id\tname\tsite_code\n"
        "S001\tAlpha\tSITE_A\n"
    )
    # sites.tsv has headers only — no data rows
    (tmp_path / "sites.tsv").write_text("site_code\tsite_name\n")

    spec = textwrap.dedent("""\
        class_derivations:
          FlatSample:
            populated_from: samples
            joins:
              sites:
                join_on: site_code
            slot_derivations:
              sample_id:
                populated_from: sample_id
              name:
                populated_from: name
              site_name:
                expr: "{sites.site_name}"
        """)
    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(TARGET_SCHEMA_YAML)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)

    results = list(transform_spec(tr, loader))

    assert len(results) == 1
    assert results[0]["sample_id"] == "S001"
    assert results[0]["name"] == "Alpha"
    # No matching row in empty sites table → None via null propagation
    assert results[0].get("site_name") is None


# ---- mixed: one derivation with joins, one without ----


def test_mixed_derivations_with_and_without_joins(tmp_path):
    """Multiple class_derivations can coexist: some with joins, some without."""
    (tmp_path / "samples.tsv").write_text(
        "sample_id\tname\tsite_code\n"
        "S001\tAlpha\tSITE_A\n"
    )
    (tmp_path / "sites.tsv").write_text(
        "site_code\tsite_name\n"
        "SITE_A\tBoston Medical\n"
    )

    # Two target classes: one uses joins, one doesn't
    target_yaml = textwrap.dedent("""\
        id: https://example.org/engine-test-target
        name: engine_test_target
        prefixes:
          linkml: https://w3id.org/linkml/
        imports:
          - linkml:types
        default_range: string
        classes:
          FlatSample:
            attributes:
              sample_id:
                identifier: true
              name: {}
              site_name: {}
          SimpleSample:
            attributes:
              sample_id:
                identifier: true
              name: {}
        """)

    spec = textwrap.dedent("""\
        class_derivations:
          FlatSample:
            populated_from: samples
            joins:
              sites:
                join_on: site_code
            slot_derivations:
              sample_id:
                populated_from: sample_id
              name:
                populated_from: name
              site_name:
                expr: "{sites.site_name}"
          SimpleSample:
            populated_from: samples
            slot_derivations:
              sample_id:
                populated_from: sample_id
              name:
                populated_from: name
        """)
    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(target_yaml)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)

    results = list(transform_spec(tr, loader))

    # Should get results from both derivations
    assert len(results) == 2
    # First: FlatSample with join
    assert results[0]["site_name"] == "Boston Medical"
    # Second: SimpleSample without join
    assert results[1]["sample_id"] == "S001"
    assert results[1]["name"] == "Alpha"
121 changes: 121 additions & 0 deletions tests/test_utils/test_lookup_index_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Edge-case tests for LookupIndex (supplements test_lookup_index.py).

Covers:
- Duplicate key behavior (LIMIT 1 first-match semantics)
- Empty tables (headers only, zero data rows)
- Lifecycle after close() (operations should fail gracefully)

See: https://github.com/linkml/linkml-map/pull/136
"""

import duckdb
import pytest

from linkml_map.utils.lookup_index import LookupIndex


@pytest.fixture()
def index():
    """Provide a fresh LookupIndex, closed again during teardown."""
    lookup_index = LookupIndex()
    yield lookup_index
    lookup_index.close()


# ---- Duplicate key behavior ----


def test_duplicate_keys_returns_a_row(index, tmp_path):
    """When multiple rows share the same key, lookup_row returns one of them.

    ``LIMIT 1`` without an ``ORDER BY`` means the winning duplicate follows
    DuckDB's storage order (insertion order for ``read_csv_auto``), which the
    API does not guarantee. This test therefore only checks that *some*
    duplicate comes back, not which one.
    """
    path = tmp_path / "dupes.tsv"
    lines = [
        "participant_id\tname\tage",
        "P001\tAlice\t30",
        "P001\tAlice-v2\t31",
        "P002\tBob\t25",
    ]
    path.write_text("\n".join(lines) + "\n")
    index.register_table("dupes", path, "participant_id")

    result = index.lookup_row("dupes", "participant_id", "P001")

    # A row IS returned (not None)
    assert result is not None
    assert result["participant_id"] == "P001"
    # The name is one of the two duplicate rows
    assert result["name"] in {"Alice", "Alice-v2"}


def test_duplicate_keys_unique_rows_unaffected(index, tmp_path):
    """Rows with unique keys are unaffected by duplicates elsewhere in the table."""
    path = tmp_path / "dupes.tsv"
    path.write_text("id\tvalue\nA\t1\nA\t2\nB\t3\n")
    index.register_table("dupes", path, "id")

    result = index.lookup_row("dupes", "id", "B")

    assert result is not None
    assert result["value"] == "3"


# ---- Empty tables ----


def test_empty_table_headers_only(index, tmp_path):
    """A table with column headers but zero data rows can be registered and queried."""
    path = tmp_path / "empty.tsv"
    path.write_text("id\tname\tage\n")

    index.register_table("empty", path, "id")

    assert index.is_registered("empty")
    # Any lookup against an empty table misses.
    assert index.lookup_row("empty", "id", "anything") is None


def test_empty_table_then_drop(index, tmp_path):
    """Dropping an empty table raises no error and deregisters it."""
    path = tmp_path / "empty.tsv"
    path.write_text("id\tvalue\n")
    index.register_table("empty", path, "id")

    index.drop("empty")

    assert not index.is_registered("empty")


# ---- Lifecycle after close() ----


def test_close_clears_tables(index, tmp_path):
    """After close(), is_registered reports False for previously registered tables."""
    path = tmp_path / "data.tsv"
    path.write_text("id\tval\nA\t1\n")
    index.register_table("data", path, "id")
    assert index.is_registered("data")

    index.close()

    assert not index.is_registered("data")


def test_operations_after_close_raise(tmp_path):
    """register_table after close() raises a DuckDB connection/input error."""
    lookup_index = LookupIndex()
    lookup_index.close()

    path = tmp_path / "data.tsv"
    path.write_text("id\tval\nA\t1\n")

    # Which of the two exception types fires depends on the duckdb version.
    expected = (duckdb.ConnectionException, duckdb.InvalidInputException)
    with pytest.raises(expected):
        lookup_index.register_table("data", path, "id")


def test_double_close_is_safe():
    """close() is idempotent; a second call must not raise."""
    lookup_index = LookupIndex()
    lookup_index.close()
    lookup_index.close()  # second close must be a no-op, not an error