diff --git a/src/linkml_map/cli/cli.py b/src/linkml_map/cli/cli.py index 45d49b7d..b118c99a 100644 --- a/src/linkml_map/cli/cli.py +++ b/src/linkml_map/cli/cli.py @@ -86,6 +86,16 @@ def main(verbose: int, quiet: bool) -> None: show_default=True, help="Allow unrestricted eval of python expressions.", ) +@click.option( + "--strict/--no-strict", + default=False, + show_default=True, + help=( + "Raise an error when an expression references a name that is not a slot " + "on the source class. Recommended for real data transforms; the default " + "warns and returns None for backward compatibility." + ), +) @click.option( "--output-format", "-f", diff --git a/src/linkml_map/transformer/object_transformer.py b/src/linkml_map/transformer/object_transformer.py index 46c1e9fb..665ecc6a 100644 --- a/src/linkml_map/transformer/object_transformer.py +++ b/src/linkml_map/transformer/object_transformer.py @@ -6,7 +6,7 @@ import logging from collections.abc import Iterator, Mapping from contextlib import contextmanager -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any import yaml @@ -126,6 +126,26 @@ def __init__( # noqa: PLR0913 self.class_deriv: ClassDerivation | None = class_deriv if bindings: self.bindings.update(bindings) + self._schema_slots: set[str] = self._collect_schema_slots() + + def _collect_schema_slots(self) -> set[str]: + """Slot names declared on the source class. + + Used by ``__getitem__`` to distinguish a schema-declared slot + absent from the current row (legitimate SQL null) from a name + that does not exist in the schema (typo / stale reference). + + Explicit-join aliases are not added here — they are already + handled earlier in ``__getitem__`` via ``_join_specs``. Slots on + join-target classes are intentionally excluded so that a bare + reference like ``{score}`` (instead of ``{Reading.score}``) + surfaces as a typo in strict mode for explicit-join specs. + Implicit-join column resolution is a separate concern handled + by the runtime merge in ``_derive_nested_objects`` (#217). + """ + if self.source_type and self.source_type in self.sv.all_classes(): + return {s.name for s in self.sv.class_induced_slots(self.source_type)} + return set() @classmethod def from_context(cls, transformer: ObjectTransformer, context: DerivationContext) -> Bindings: @@ -203,8 +223,16 @@ def __getitem__(self, name: Any) -> Any: elif name == self.source_type and name not in self.source_obj: # Self-reference: {Reading.score} when source_type is Reading (non-merge context) self.bindings[name] = DynObj(**self.source_obj) - else: + elif name in self.source_obj: _ = self.get_ctxt_obj_and_dict({name: self.source_obj[name]}) + elif name in self._schema_slots: + # Schema-declared slot absent from this row → SQL null. + # Distinct from a name that is not in the schema at all, + # which falls through and raises KeyError so simpleeval + # surfaces it as NameNotDefined (handled in strict mode). + return None + else: + raise KeyError(name) return self.bindings.get(name) @@ -243,6 +271,18 @@ class ObjectTransformer(Transformer): object_index: ObjectIndex = None lookup_index: Any = None # Optional[LookupIndex] — lazy import to avoid hard duckdb dep + _warned_unbound_names: set[str] = field(default_factory=set, repr=False) + """Names already warned about in non-strict mode. + + Shared across all expression evaluations on this transformer + instance — each unique unbound reference logs once, not once per + row. The set is never cleared automatically, so a transformer + reused across multiple logical runs will not re-warn for the same + typo. This is the intended behavior: if a spec has a typo, one + warning per typo across the lifetime of the transformer is enough. + Construct a fresh ``ObjectTransformer`` if you want a clean slate. + """ + def index(self, source_obj: Any, target: str | None = None) -> None: """ Create an index over a container object. @@ -510,7 +550,13 @@ def _eval_expr( (e.g. ``slot`` for referencing previously derived target slots). """ try: - return eval_expr_with_mapping(expr, bindings, functions=functions) + return eval_expr_with_mapping( + expr, + bindings, + functions=functions, + strict=self.strict, + warned_unbound=self._warned_unbound_names, + ) except (InvalidExpression, TypeError, ValueError): if not self.unrestricted_eval: raise diff --git a/src/linkml_map/transformer/transformer.py b/src/linkml_map/transformer/transformer.py index c2eeea92..307d0227 100644 --- a/src/linkml_map/transformer/transformer.py +++ b/src/linkml_map/transformer/transformer.py @@ -79,6 +79,18 @@ class Transformer(ABC): unrestricted_eval: bool = field(default=False) """Set to True to allow arbitrary evals as part of transformation.""" + strict: bool = field(default=False) + """Raise on expression references that do not resolve to a schema slot. + + When ``False`` (the default), unresolved names emit a warning and the + expression evaluator returns ``None`` (preserving SQL-style null + propagation for legitimate empty values but losing the signal for + typos). When ``True``, unresolved names raise + :class:`~linkml_map.transformer.errors.TransformationError`, which + surfaces typos, stale references, and wrong-table accesses that + would otherwise produce silent nulls in the output. + """ + _curie_converter: Converter = None def map_object(self, obj: OBJECT_TYPE, source_type: str | None = None, **kwargs: Any) -> OBJECT_TYPE: diff --git a/src/linkml_map/utils/eval_utils.py b/src/linkml_map/utils/eval_utils.py index f138260c..52c60139 100644 --- a/src/linkml_map/utils/eval_utils.py +++ b/src/linkml_map/utils/eval_utils.py @@ -410,7 +410,18 @@ class LinkMLEvaluator(EvalWithCompoundTypes): - Numeric-string coercion for comparison operators """ - def __init__(self, **kwargs: Any) -> None: # noqa: ANN401 + def __init__( + self, + *, + strict: bool = False, + warned_unbound: set[str] | None = None, + **kwargs: Any, # noqa: ANN401 + ) -> None: + self.strict: bool = strict + # Shared across rows when the caller passes a set in. Per-evaluator + # default dedupes within a single expression eval (mostly a no-op + # since a given name appears once per expr, but cheap and safe). + self._warned_unbound: set[str] = warned_unbound if warned_unbound is not None else set() super().__init__(**kwargs) for op_type in (ast.Eq, ast.NotEq, ast.Lt, ast.LtE, ast.Gt, ast.GtE): self.operators[op_type] = _coercing(self.operators[op_type]) @@ -436,14 +447,39 @@ def __init__(self, **kwargs: Any) -> None: # noqa: ANN401 def _eval_name(self, node: ast.Name) -> Any: # noqa: ANN401 """ - Override name resolution to return None for unbound variables. - - The default simpleeval behavior raises ``NameNotDefined``. - In LinkML expressions, unbound variables resolve to None. + Override name resolution to surface unbound variables in strict mode. + + The default simpleeval behavior raises ``NameNotDefined`` when a + name has no binding. With ``strict=True``, this is re-raised as + a built-in :class:`NameError` so callers in the transformer can + wrap it with derivation context (see + ``ObjectTransformer._slot_error_context``). With ``strict=False`` + it emits a warning and returns ``None`` (preserving SQL-style + null propagation for backward compatibility, while losing the + signal for typos and stale references). + + ``NameError`` is deliberately chosen over ``TransformationError`` + here: ``_slot_error_context`` only enriches non-``TransformationError`` + exceptions with class/slot/row context. ``NameError`` is also + outside the ``(InvalidExpression, TypeError, ValueError)`` catch + list in ``ObjectTransformer._eval_expr``, so it propagates + cleanly past the ``unrestricted_eval`` fallback rather than + being swallowed. + + Note: names that resolve to ``None`` (e.g. a schema-declared slot + absent from the current row) never reach this path — they + succeed via ``super()._eval_name`` and propagate ``None`` as + legitimate SQL null. """ try: return super()._eval_name(node) except NameNotDefined: + msg = f"Expression references '{node.id}' which is not a slot on the source class. Typo or stale reference?" + if self.strict: + raise NameError(msg) from None + if node.id not in self._warned_unbound: + self._warned_unbound.add(node.id) + logger.warning("%s Returning None (run in strict mode to surface as an error).", msg) return None def _eval_set(self, node: ast.Set) -> Any: # noqa: ANN401 @@ -503,15 +539,29 @@ def _distributed_getattr(obj: Any, attr: str) -> Any: # noqa: ANN401 return getattr(obj, attr) -def _make_evaluator(names: Any, functions: Any = None) -> LinkMLEvaluator: # noqa: ANN401 +def _make_evaluator( + names: Any, # noqa: ANN401 + functions: Any = None, # noqa: ANN401 + *, + strict: bool = False, + warned_unbound: set[str] | None = None, +) -> LinkMLEvaluator: """Create a configured LinkMLEvaluator instance.""" - return LinkMLEvaluator(names=names, functions=functions or FUNCTIONS) + return LinkMLEvaluator( + names=names, + functions=functions or FUNCTIONS, + strict=strict, + warned_unbound=warned_unbound, + ) def eval_expr_with_mapping( expr: str, mapping: Mapping, functions: dict[str, Any] | None = None, + *, + strict: bool = False, + warned_unbound: set[str] | None = None, ) -> Any: # noqa: ANN401 """ Evaluate a given expression, with restricted syntax. @@ -523,11 +573,18 @@ def eval_expr_with_mapping( :param mapping: Variable bindings for the expression. :param functions: Additional functions to make available. Merged with built-in functions; caller-provided names take precedence. + :param strict: If ``True``, unbound names raise + :class:`~linkml_map.transformer.errors.TransformationError` + instead of warning and returning ``None``. + :param warned_unbound: Optional set used to dedupe non-strict + warnings across multiple ``eval_expr_with_mapping`` calls. Pass + the same set across all evaluations in a single transform run + so each unique unbound name is logged once, not once per row. """ if expr == "None": return None merged = {**FUNCTIONS, **functions} if functions else None - evaluator = _make_evaluator(names=mapping, functions=merged) + evaluator = _make_evaluator(names=mapping, functions=merged, strict=strict, warned_unbound=warned_unbound) return evaluator.eval(expr) diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py index d9126d6b..879dc0bb 100644 --- a/tests/test_cli/test_cli.py +++ b/tests/test_cli/test_cli.py @@ -164,6 +164,84 @@ def test_map_data(runner: CliRunner) -> None: assert tr_data["agents"][0]["label"] == "fred bloggs" +def _write_typo_spec_fixture(tmp_path: Path) -> tuple[Path, Path, Path]: + """Write a minimal schema + spec (with a typo'd expr) + input row to tmp_path. + + Returns (schema_path, spec_path, input_path). + """ + schema_path = tmp_path / "schema.yaml" + schema_path.write_text( + "id: https://example.org/typo-source\n" + "name: typo_source\n" + "prefixes:\n" + " linkml: https://w3id.org/linkml/\n" + "imports:\n" + " - linkml:types\n" + "default_range: string\n" + "classes:\n" + " Person:\n" + " attributes:\n" + " id: {identifier: true}\n" + " score: {range: integer}\n" + ) + spec_path = tmp_path / "spec.yaml" + spec_path.write_text( + "source_schema: typo_source\n" + "class_derivations:\n" + " Agent:\n" + " populated_from: Person\n" + " slot_derivations:\n" + " name:\n" + ' expr: "{scroe}"\n' # codespell:ignore + ) + input_path = tmp_path / "input.yaml" + input_path.write_text("id: p1\nscore: 5\n") + return schema_path, spec_path, input_path + + +def test_map_data_strict_flag_surfaces_typo(runner: CliRunner, tmp_path: Path) -> None: + """`--strict` makes an expression typo fail the command instead of silent null.""" + schema_path, spec_path, input_path = _write_typo_spec_fixture(tmp_path) + result = runner.invoke( + main, + [ + "map-data", + "--strict", + "-T", + str(spec_path), + "-s", + str(schema_path), + "--source-type", + "Person", + str(input_path), + ], + ) + assert result.exit_code != 0 + assert "scroe" in (result.stderr + str(result.exception)) # codespell:ignore + + +def test_map_data_no_strict_default_succeeds(runner: CliRunner, tmp_path: Path) -> None: + """Default (non-strict) preserves backward-compat: exits 0, produces empty/null output.""" + schema_path, spec_path, input_path = _write_typo_spec_fixture(tmp_path) + result = runner.invoke( + main, + [ + "map-data", + "-T", + str(spec_path), + "-s", + str(schema_path), + "--source-type", + "Person", + str(input_path), + ], + ) + assert result.exit_code == 0 + # Null values are omitted by the YAML dumper, so {name: None} round-trips to {}. + out = yaml.safe_load(result.stdout) or {} + assert out.get("name") is None + + def test_map_data_norm_denorm(runner: CliRunner) -> None: cmd = [ "map-data", diff --git a/tests/test_transformer/test_expr_strict_mode.py b/tests/test_transformer/test_expr_strict_mode.py new file mode 100644 index 00000000..c8d08cc2 --- /dev/null +++ b/tests/test_transformer/test_expr_strict_mode.py @@ -0,0 +1,148 @@ +"""Tests for strict-mode expression evaluation (issue #213). + +In non-strict mode the evaluator preserves SQL-style null propagation but +emits a warning when an expression references a name that is not a slot +on the source class. In strict mode the same condition raises +``TransformationError``. + +A schema-declared slot that is absent from the current row must still +resolve to ``None`` in both modes — that case is a real SQL null, not +a typo. +""" + +import logging + +import pytest +import yaml +from linkml.utils.schema_builder import SchemaBuilder +from linkml_runtime import SchemaView + +from linkml_map.transformer.errors import TransformationError +from linkml_map.transformer.object_transformer import ObjectTransformer + + +def _build_schemas() -> tuple: + """Source has Person(id, age, score); target has Agent(name).""" + sb_source = SchemaBuilder() + sb_source.add_slot("id", range="string") + sb_source.add_slot("age", range="integer") + sb_source.add_slot("score", range="integer") + sb_source.add_class("Person", slots=["id", "age", "score"]) + sb_source.add_defaults() + + sb_target = SchemaBuilder() + sb_target.add_slot("name", range="string") + sb_target.add_class("Agent", slots=["name"]) + sb_target.add_defaults() + + return sb_source.schema, sb_target.schema + + +def _make_transformer(transform_spec: dict, *, strict: bool) -> ObjectTransformer: + """Build a transformer with the given strict flag and inline spec.""" + source_schema, target_schema = _build_schemas() + transformer = ObjectTransformer(strict=strict) + transformer.source_schemaview = SchemaView(source_schema) + transformer.target_schemaview = SchemaView(target_schema) + transformer.create_transformer_specification(transform_spec) + return transformer + + +TYPO_SPEC = yaml.safe_load( + "class_derivations:\n" + " Agent:\n" + " populated_from: Person\n" + " slot_derivations:\n" + " name:\n" + ' expr: "{scroe}"\n' # codespell:ignore +) + + +VALID_SPEC = yaml.safe_load(""" +class_derivations: + Agent: + populated_from: Person + slot_derivations: + name: + expr: "str({score})" +""") + + +def test_typo_strict_raises() -> None: + """Strict mode raises TransformationError when an expression references a non-slot. + + The raw ``NameError`` from ``_eval_name`` is wrapped by + ``_slot_error_context`` so the surfaced exception carries class / + slot / source-row context for diagnostics. + """ + transformer = _make_transformer(TYPO_SPEC, strict=True) + with pytest.raises(TransformationError, match="scroe") as exc_info: # codespell:ignore + transformer.map_object({"id": "p1", "age": 30, "score": 5}, source_type="Person") + assert exc_info.value.class_derivation_name == "Agent" + assert exc_info.value.slot_derivation_name == "name" + + +def test_typo_non_strict_warns_and_returns_none(caplog: pytest.LogCaptureFixture) -> None: + """Non-strict mode keeps the existing return-None behavior but logs a warning.""" + transformer = _make_transformer(TYPO_SPEC, strict=False) + with caplog.at_level(logging.WARNING, logger="linkml_map.utils.eval_utils"): + result = transformer.map_object({"id": "p1", "age": 30, "score": 5}, source_type="Person") + assert result == {"name": None} + assert any("scroe" in rec.message for rec in caplog.records) # codespell:ignore + + +@pytest.mark.parametrize("strict", [True, False]) +def test_schema_slot_with_null_value_resolves_to_none(strict: bool) -> None: + """Slot is in the schema, value is None on this row → expression resolves to None.""" + transformer = _make_transformer(VALID_SPEC, strict=strict) + result = transformer.map_object({"id": "p1", "age": 30, "score": None}, source_type="Person") + assert result == {"name": None} + + +@pytest.mark.parametrize("strict", [True, False]) +def test_schema_slot_absent_from_row_resolves_to_none(strict: bool) -> None: + """Slot is declared on the source class but missing from this row → None (SQL null). + + This is the key distinction the fix introduces: a schema-declared + slot absent from the row is a legitimate null, while a name that + is not a slot at all is a typo. Both must behave correctly in + strict mode. + """ + transformer = _make_transformer(VALID_SPEC, strict=strict) + result = transformer.map_object({"id": "p1", "age": 30}, source_type="Person") + assert result == {"name": None} + + +def test_valid_slot_resolves_in_strict_mode() -> None: + """A valid bare-name reference works in strict mode.""" + transformer = _make_transformer(VALID_SPEC, strict=True) + result = transformer.map_object({"id": "p1", "age": 30, "score": 5}, source_type="Person") + assert result == {"name": "5"} + + +def test_default_strict_is_false() -> None: + """Default behavior is non-strict to preserve backward compatibility.""" + transformer = ObjectTransformer() + assert transformer.strict is False + + +def test_non_strict_warning_dedupes_across_rows(caplog: pytest.LogCaptureFixture) -> None: + """Each unique unbound name should warn once per transform run, not once per row. + + Without dedup, a typo in a spec used against a 10M-row dataset + produces 10M log lines for one underlying problem. + """ + transformer = _make_transformer(TYPO_SPEC, strict=False) + rows = [ + {"id": "p1", "age": 30, "score": 5}, + {"id": "p2", "age": 31, "score": 6}, + {"id": "p3", "age": 32, "score": 7}, + ] + with caplog.at_level(logging.WARNING, logger="linkml_map.utils.eval_utils"): + for row in rows: + transformer.map_object(row, source_type="Person") + + typo_warnings = [rec for rec in caplog.records if "scroe" in rec.message] # codespell:ignore + assert len(typo_warnings) == 1, ( + f"Expected one warning for the typo across {len(rows)} rows, got {len(typo_warnings)}" + ) diff --git a/tests/test_utils/test_eval_utils.py b/tests/test_utils/test_eval_utils.py index e0fc7b69..7ae53dcb 100644 --- a/tests/test_utils/test_eval_utils.py +++ b/tests/test_utils/test_eval_utils.py @@ -641,6 +641,49 @@ def test_eval_expr_with_mapping_none_literal() -> None: assert eval_expr_with_mapping("None", {}) is None +# ---- Strict mode for unbound names (issue #213) ---- + + +def test_eval_unbound_name_default_returns_none() -> None: + """Non-strict (default) returns None for unbound names, preserving SQL-null semantics.""" + from linkml_map.utils.eval_utils import eval_expr_with_mapping + + assert eval_expr_with_mapping("{scroe}", {"score": 5}) is None # codespell:ignore + + +def test_eval_unbound_name_default_logs_warning(caplog: pytest.LogCaptureFixture) -> None: + """Non-strict emits a warning so the unbound reference is not entirely silent.""" + import logging + + from linkml_map.utils.eval_utils import eval_expr_with_mapping + + with caplog.at_level(logging.WARNING, logger="linkml_map.utils.eval_utils"): + result = eval_expr_with_mapping("{scroe}", {"score": 5}) # codespell:ignore + + assert result is None + assert any("scroe" in rec.message for rec in caplog.records) # codespell:ignore + + +def test_eval_unbound_name_strict_raises() -> None: + """Strict surfaces unbound names as NameError with the offending name. + + The transformer wraps this as ``TransformationError`` with derivation + context via ``_slot_error_context``; at the bare-evaluator level it + is the built-in ``NameError``. + """ + from linkml_map.utils.eval_utils import eval_expr_with_mapping + + with pytest.raises(NameError, match="scroe"): # codespell:ignore + eval_expr_with_mapping("{scroe}", {"score": 5}, strict=True) # codespell:ignore + + +def test_eval_bound_name_with_none_value_returns_none_in_strict() -> None: + """A name that is bound to None resolves to None even in strict mode (real SQL null).""" + from linkml_map.utils.eval_utils import eval_expr_with_mapping + + assert eval_expr_with_mapping("{score}", {"score": None}, strict=True) is None + + # ---- Distribution edge cases ----