From c8506c9177898455633045809e14d39420c22faa Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 10:45:50 -0500 Subject: [PATCH 1/7] Recurse into nested class_derivations in validate-spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walk nested class_derivations declared on slot_derivations and validate them like top-level CDs, threading the parent CD through so cross-table boundaries can be diagnosed. Adds three checks: - Cross-table mismatch: when a nested CD's populated_from differs from its parent's, emit INFO when an implicit join would synthesize at runtime (#212) and WARNING when no implicit join is possible. Closes #211. - Joined-class attribute refs: expressions like {alias.field} are now validated against the joined class's slot list — for both explicit joins: aliases and nested-CD implicit-join targets. Replaces the previous "drop join aliases from check" behavior. Closes #213. - is_a / mixins resolution: string references resolve against the spec-internal class_derivation name pool first (matches the runtime's _find_class_derivation_by_name), then against target schema classes via SchemaView, else error. Closes #219. ValidationMessage.severity widens to Literal["error", "warning", "info"] to carry the implicit-join INFO. --- src/linkml_map/validator.py | 349 ++++++++++++++++++++-- tests/test_validator.py | 564 ++++++++++++++++++++++++++++++++++++ 2 files changed, 890 insertions(+), 23 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index 98b470d0..bb892427 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -71,7 +71,7 @@ class ValidationMessage: added in the future. """ - severity: Literal["error", "warning"] + severity: Literal["error", "warning", "info"] path: str message: str category: str | None = None @@ -240,6 +240,45 @@ def extract_expr_slot_references(expr: str) -> set[str]: return names - _EXPR_SAFE_NAMES - bound +def extract_expr_attribute_references(expr: str) -> dict[str, set[str]]: + """Extract ``base.attr`` accesses from a LinkML expression. + + Used to validate cross-table references like ``{joined_table.field}`` + against the joined class's slot list. The ``src.attr`` form is excluded + because :func:`extract_expr_slot_references` already treats it as a bare + slot reference on the source class. + + :param expr: A LinkML expression string. + :returns: Mapping of base name to the set of attributes accessed on it. + + Example:: + + >>> result = extract_expr_attribute_references("{demographics.age}") + >>> sorted(result["demographics"]) + ['age'] + >>> extract_expr_attribute_references("plain_var") + {} + >>> extract_expr_attribute_references("src.foo") + {} + """ + try: + tree = ast.parse(expr, mode="eval") + except SyntaxError: + try: + tree = ast.parse(expr, mode="exec") + except SyntaxError: + return {} + + result: dict[str, set[str]] = {} + for node in ast.walk(tree): + if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name): + base = node.value.id + if base == "src" or base in _EXPR_SAFE_NAMES: + continue + result.setdefault(base, set()).add(node.attr) + return result + + # --------------------------------------------------------------------------- # Semantic validation # --------------------------------------------------------------------------- @@ -495,9 +534,15 @@ def validate_spec_semantics( if source_sv is None and target_sv is None: return messages - # Validate class_derivations + # Top-level class_derivation name pool — used to resolve is_a / mixins + # references against spec-internal derivations (#219). Matches the + # runtime's _find_class_derivation_by_name which only inspects the + # top-level CD list. + derivation_pool = _collect_class_derivation_pool(data) + + # Validate class_derivations (recurses into nested CDs internally) for cd in _iter_derivation_dicts(data.get("class_derivations", [])): - _validate_class_derivation(cd, source_sv, target_sv, strict, messages) + _validate_class_derivation(cd, source_sv, target_sv, strict, messages, derivation_pool=derivation_pool) # Validate enum_derivations for ed in _iter_derivation_dicts(data.get("enum_derivations", [])): @@ -506,16 +551,54 @@ def validate_spec_semantics( return messages +def _collect_class_derivation_pool(data: dict[str, Any]) -> set[str]: + """Names of all top-level class_derivations in the spec. + + This is the pool used to resolve ``is_a`` and ``mixins`` references + against spec-internal derivations (#219). Only top-level CDs are + included, matching the runtime's + :meth:`~linkml_map.transformer.transformer.Transformer._find_class_derivation_by_name` + which raises ``KeyError`` for anything not at the top level. + """ + return {cd.get("name") for cd in _iter_derivation_dicts(data.get("class_derivations", [])) if cd.get("name")} + + def _validate_class_derivation( cd: dict[str, Any], source_sv: SchemaView | None, target_sv: SchemaView | None, strict: bool, messages: list[ValidationMessage], + parent_class_deriv: dict[str, Any] | None = None, + parent_path: str = "", + derivation_pool: set[str] | None = None, ) -> None: - """Validate a single class derivation against schemas.""" + """Validate a single class derivation against schemas. + + Recurses into nested ``class_derivations`` declared under any slot + derivation, threading the parent CD through so cross-table references + (#211) can be diagnosed. + + :param parent_class_deriv: For nested CDs, the enclosing class + derivation's dict. ``None`` for top-level CDs. + :param parent_path: Path prefix for nested validation messages + (e.g. ``class_derivations[Outer].slot_derivations[inner]``). + Empty string for top-level CDs. + :param derivation_pool: Names of all top-level class_derivations in + the spec. Used to resolve ``is_a``/``mixins`` (#219). ``None`` + skips the check. + """ cd_name = cd.get("name", "?") - cd_path = f"class_derivations[{cd_name}]" + cd_path = f"{parent_path}.class_derivations[{cd_name}]" if parent_path else f"class_derivations[{cd_name}]" + + # Cross-table check for nested CDs (closes #211): does the nested CD's + # populated_from differ from its parent's, and can the join be resolved? + if parent_class_deriv is not None: + _check_cross_table_join(cd, parent_class_deriv, source_sv, cd_path, messages) + + # is_a / mixins resolution check (closes #219). + if derivation_pool is not None: + _check_class_inheritance_refs(cd, cd_path, derivation_pool, target_sv, messages) # Target: class name should exist target_class_slots: set[str] | None = None @@ -556,24 +639,38 @@ def _validate_class_derivation( # Validate slot_derivations (may be list or dict after normalization) slot_derivation_dicts = _iter_derivation_dicts(cd.get("slot_derivations", [])) - # Join aliases — names like "demographics" in {demographics.age} that - # reference a joined table, not a slot on the source class. Excluded - # from expression-reference validation to avoid false-positive warnings. - joins = cd.get("joins") or {} - joined_aliases: set[str] = set(joins.keys()) if isinstance(joins, dict) else set() + # Build alias -> slot-set map for cross-table expression reference checks. + # Covers both explicit `joins:` aliases and the nested-CD populated_from + # targets that #212's normalization will synthesize joins for. + joined_class_slots = _build_joined_class_map(cd, source_sv, slot_derivation_dicts) for sd in slot_derivation_dicts: + sd_name = sd.get("name", "?") + sd_path = f"{cd_path}.slot_derivations[{sd_name}]" _validate_slot_derivation( sd, cd_name, cd_path, source_class_slots, target_class_slots, - joined_aliases, + joined_class_slots, strict, messages, ) + # Recurse into nested class_derivations declared on this slot. + for nested_cd in _iter_derivation_dicts(sd.get("class_derivations", [])): + _validate_class_derivation( + nested_cd, + source_sv, + target_sv, + strict, + messages, + parent_class_deriv=cd, + parent_path=sd_path, + derivation_pool=derivation_pool, + ) + # Warning: target class has required slots with no derivation if target_sv is not None and target_class_slots is not None: derived_slot_names = {sd.get("name") for sd in slot_derivation_dicts if "name" in sd} @@ -588,22 +685,203 @@ def _validate_class_derivation( ) +def _build_joined_class_map( + cd: dict[str, Any], + source_sv: SchemaView | None, + slot_derivation_dicts: list[dict[str, Any]], +) -> dict[str, set[str] | None]: + """Map alias names to the slot set of their joined class. + + Combines two sources of "joined tables" visible from this CD's + expressions: + + 1. Explicit ``joins:`` aliases on this CD. The joined class is + ``joins[alias].class_named`` if set, otherwise the alias name. + 2. Implicit join targets — nested ``class_derivations`` whose + ``populated_from`` differs from this CD's, which the engine's + normalization pass (#212) will auto-resolve. + + :param cd: This class derivation dict. + :param source_sv: Source schema view, or ``None``. + :param slot_derivation_dicts: Pre-iterated slot derivations (avoids + re-walking). + :returns: Mapping of alias name to its joined class's slot set. Value + is ``None`` when the joined class can't be resolved in the source + schema (or when no source schema is available). + """ + result: dict[str, set[str] | None] = {} + + joins = cd.get("joins") or {} + if isinstance(joins, dict): + for alias, spec in joins.items(): + joined_class: str + if isinstance(spec, dict): + joined_class = spec.get("class_named") or alias + else: + joined_class = alias + if source_sv is not None and joined_class in set(source_sv.all_classes()): + result[alias] = {s.name for s in source_sv.class_induced_slots(joined_class)} + else: + result[alias] = None + + parent_source = cd.get("populated_from") + if source_sv is not None and parent_source: + all_classes = set(source_sv.all_classes()) + for sd in slot_derivation_dicts: + for nested in _iter_derivation_dicts(sd.get("class_derivations", [])): + nested_source = nested.get("populated_from") + if nested_source and nested_source != parent_source and nested_source not in result: + if nested_source in all_classes: + result[nested_source] = {s.name for s in source_sv.class_induced_slots(nested_source)} + else: + result[nested_source] = None + + return result + + +def _check_cross_table_join( + nested_cd: dict[str, Any], + parent_cd: dict[str, Any], + source_sv: SchemaView | None, + nested_path: str, + messages: list[ValidationMessage], +) -> None: + """Diagnose nested CDs that reference a different source table than their parent. + + Closes #211 at validate-spec time. Mirrors the runtime synthesis logic + in :class:`~linkml_map.transformer.transformer.Transformer`: + + - Explicit ``joins:`` covers the nested source → no message. + - No explicit join, :func:`~linkml_map.utils.join_utils.pick_join_key` + returns a column → ``info`` (#212 will auto-synthesize at runtime). + - No explicit join, no implicit join can be synthesized → ``warning`` + with the same diagnostic the runtime would raise. + """ + nested_source = nested_cd.get("populated_from") + parent_source = parent_cd.get("populated_from") + if not nested_source or not parent_source or nested_source == parent_source: + return + + parent_joins = parent_cd.get("joins") or {} + if isinstance(parent_joins, dict) and nested_source in parent_joins: + return + + if source_sv is None: + return + + from linkml_map.utils.join_utils import find_common_columns, pick_join_key + + all_classes = set(source_sv.all_classes()) + if parent_source not in all_classes or nested_source not in all_classes: + # Missing-class errors are emitted elsewhere; can't predict joinability. + return + + key = pick_join_key(source_sv, parent_source, nested_source) + if key is not None: + messages.append( + ValidationMessage( + severity="info", + path=nested_path, + message=( + f"Nested 'populated_from={nested_source}' differs from parent " + f"'populated_from={parent_source}'. No explicit joins: block; " + f"implicit join will be synthesized on column '{key}'. " + f"Consider declaring the join explicitly." + ), + ) + ) + return + + common = find_common_columns(source_sv, parent_source, nested_source) + if not common: + reason = f"no columns are shared between '{parent_source}' and '{nested_source}'" + else: + candidates = ", ".join(f"'{c}'" for c in sorted(common)) + reason = ( + f"multiple candidate join columns are shared between '{parent_source}' " + f"and '{nested_source}' ({candidates}); cannot pick automatically" + ) + messages.append( + ValidationMessage( + severity="warning", + path=nested_path, + message=( + f"Nested 'populated_from={nested_source}' differs from parent " + f"'populated_from={parent_source}', but no implicit join can be " + f"synthesized: {reason}. Add an explicit joins: block — cross-table " + f"values will otherwise resolve to null." + ), + ) + ) + + +def _check_class_inheritance_refs( + cd: dict[str, Any], + cd_path: str, + derivation_pool: set[str], + target_sv: SchemaView | None, + messages: list[ValidationMessage], +) -> None: + """Resolve ``is_a`` and ``mixins`` string references (closes #219). + + Each reference must resolve to either: + + 1. A top-level ``class_derivation`` in this spec (matches the runtime's + ``_find_class_derivation_by_name`` behavior), or + 2. A class in the target schema (matches LinkML's ``is_a`` convention + on plain schemas). + + A reference that resolves to neither emits an ``error``. + """ + parents: list[tuple[str, str]] = [] + is_a = cd.get("is_a") + if isinstance(is_a, str): + parents.append(("is_a", is_a)) + mixins = cd.get("mixins") + if isinstance(mixins, list): + parents.extend(("mixins", m) for m in mixins if isinstance(m, str)) + + if not parents: + return + + target_classes = set(target_sv.all_classes()) if target_sv is not None else set() + + for field_label, parent_name in parents: + if parent_name in derivation_pool: + continue + if parent_name in target_classes: + continue + messages.append( + ValidationMessage( + severity="error", + path=cd_path, + message=( + f"'{field_label}: {parent_name}' does not resolve to a " + f"class_derivation in this spec or a class in the target schema" + ), + ) + ) + + def _validate_slot_derivation( sd: dict[str, Any], parent_class_name: str, parent_path: str, source_class_slots: set[str] | None, target_class_slots: set[str] | None, - joined_aliases: set[str], + joined_class_slots: dict[str, set[str] | None], strict: bool, messages: list[ValidationMessage], ) -> None: """Validate a single slot derivation against schemas. - :param joined_aliases: Names declared in the parent class derivation's - ``joins`` mapping. Expression references to these names (e.g. - ``{demographics.age}``) are skipped — they refer to joined tables, - not source-class slots. + :param joined_class_slots: Mapping of alias name (from explicit + ``joins:`` or an implicit-join target) to the slot set of the + joined class. Expression bare-name references to these aliases + are excluded from source-class checks, and attribute references + like ``{alias.field}`` are validated against the joined class's + slots. Value of ``None`` means the joined class is unknown — the + bare alias is still tolerated, but attribute access is skipped. """ sd_name = sd.get("name", "?") sd_path = f"{parent_path}.slot_derivations[{sd_name}]" @@ -630,18 +908,43 @@ def _validate_slot_derivation( ) ) - # Expression slot references — skip join aliases, they reference joined - # tables (validated against the joined class is a separate enhancement). expr = sd.get("expr") - if expr is not None and source_class_slots is not None: - refs = extract_expr_slot_references(expr) - joined_aliases - for ref in sorted(refs): - if ref not in source_class_slots: + if expr is None or source_class_slots is None: + return + + joined_aliases = set(joined_class_slots.keys()) + + # Bare-name expression refs — exclude join aliases (they're a "base", not + # a slot on the source class). + refs = extract_expr_slot_references(expr) - joined_aliases + for ref in sorted(refs): + if ref not in source_class_slots: + messages.append( + ValidationMessage( + severity="error" if strict else "warning", + path=sd_path, + message=f"Expression references '{ref}' which is not a slot on the source class", + ) + ) + + # Cross-table attribute refs — partial #213 coverage: {alias.field} + # validated against the joined class's slots when known. + attr_refs = extract_expr_attribute_references(expr) + for base, attrs in attr_refs.items(): + if base not in joined_class_slots: + continue + joined_slots = joined_class_slots[base] + if joined_slots is None: + continue + for attr in sorted(attrs): + if attr not in joined_slots: messages.append( ValidationMessage( severity="error" if strict else "warning", path=sd_path, - message=f"Expression references '{ref}' which is not a slot on the source class", + message=( + f"Expression references '{base}.{attr}' but '{attr}' is not a slot on joined class '{base}'" + ), ) ) diff --git a/tests/test_validator.py b/tests/test_validator.py index 21358dcd..f184e246 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,9 +1,13 @@ """Tests for transformation specification validation.""" +import textwrap + import pytest +from linkml_runtime import SchemaView from linkml_map.validator import ( ValidationMessage, + extract_expr_attribute_references, extract_expr_slot_references, normalize_spec_dict, validate_spec, @@ -31,6 +35,10 @@ def _warnings(messages: list[ValidationMessage]) -> list[ValidationMessage]: return [m for m in messages if m.severity == "warning"] +def _infos(messages: list[ValidationMessage]) -> list[ValidationMessage]: + return [m for m in messages if m.severity == "info"] + + # --------------------------------------------------------------------------- # Unit tests for normalize_spec_dict # --------------------------------------------------------------------------- @@ -252,6 +260,28 @@ def test_extract_expr_unparsable(): assert extract_expr_slot_references("{{{{") == set() +@pytest.mark.parametrize( + ("expr", "expected"), + [ + ("{demographics.age}", {"demographics": {"age"}}), + ("{a.x} + {a.y}", {"a": {"x", "y"}}), + ("{a.x} + {b.y}", {"a": {"x"}, "b": {"y"}}), + ("src.foo", {}), # src.* handled by slot ref extractor + ("plain_var", {}), + ("'hello'", {}), + ], + ids=lambda x: repr(x) if isinstance(x, str) else None, +) +def test_extract_expr_attribute_references(expr, expected): + """Attribute reference extraction covers cross-table dot notation.""" + assert extract_expr_attribute_references(expr) == expected + + +def test_extract_expr_attribute_references_unparsable(): + """Unparsable expressions yield an empty mapping, not a crash.""" + assert extract_expr_attribute_references("{{{{") == {} + + # --------------------------------------------------------------------------- # Unit tests for validate_spec_semantics # --------------------------------------------------------------------------- @@ -652,3 +682,537 @@ def test_validate_spec_includes_deprecation_messages(): deprecations = [m for m in msgs if m.category == "deprecated"] assert len(deprecations) == 1 assert "sources" in deprecations[0].message + + +# --------------------------------------------------------------------------- +# Nested cross-table class_derivation validation (#211) +# --------------------------------------------------------------------------- + + +JOINABLE_SCHEMA = textwrap.dedent("""\ + id: https://example.org/joinable + name: joinable + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: joinable + default_range: string + imports: + - linkml:types + + classes: + Measurement: + attributes: + id: + identifier: true + subject_id: + method: + Reading: + attributes: + id: + identifier: true + subject_id: + score: + range: float +""") + + +NO_OVERLAP_SCHEMA = textwrap.dedent("""\ + id: https://example.org/no-overlap + name: no_overlap + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: no_overlap + default_range: string + imports: + - linkml:types + + classes: + Measurement: + attributes: + id: + identifier: true + method: + Reading: + attributes: + reading_id: + identifier: true + score: +""") + + +AMBIGUOUS_SCHEMA = textwrap.dedent("""\ + id: https://example.org/ambiguous + name: ambiguous + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: ambiguous + default_range: string + imports: + - linkml:types + + classes: + Measurement: + attributes: + id: + identifier: true + subject_id: + sample_id: + method: + Reading: + attributes: + id: + identifier: true + subject_id: + sample_id: + score: +""") + + +THREE_TABLE_SCHEMA = textwrap.dedent("""\ + id: https://example.org/three-table + name: three_table + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: three_table + default_range: string + imports: + - linkml:types + + classes: + Outer: + attributes: + id: + identifier: true + shared_om: + Middle: + attributes: + mid_id: + identifier: true + shared_om: + shared_mi: + Inner: + attributes: + inner_id: + identifier: true + shared_mi: + payload: +""") + + +def _nested_spec(outer_pf: str, inner_pf: str, joins: dict | None = None) -> dict: + """Build a normalized 2-level nested spec with the given populated_from values.""" + inner_cd: dict = { + "Inner": { + "populated_from": inner_pf, + "slot_derivations": {"score": None}, + } + } + outer_cd: dict = { + "populated_from": outer_pf, + "slot_derivations": { + "observation": {"class_derivations": [inner_cd]}, + }, + } + if joins is not None: + outer_cd["joins"] = joins + return normalize_spec_dict({"class_derivations": {"Outer": outer_cd}}) + + +def _nested_path_segment(path: str) -> bool: + return "Inner" in path and "Outer" in path + + +def test_nested_cd_same_populated_from_no_message(): + """Nested CD with the same populated_from as parent emits no cross-table message.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec(outer_pf="Measurement", inner_pf="Measurement") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + cross_msgs = [m for m in msgs if "implicit join" in m.message or "no implicit join" in m.message] + assert cross_msgs == [] + + +def test_nested_cd_explicit_join_no_message(): + """Nested CD with an explicit joins: entry covering its source emits no cross-table message.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec( + outer_pf="Measurement", + inner_pf="Reading", + joins={"Reading": {"join_on": "subject_id"}}, + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + cross_msgs = [m for m in msgs if "implicit join" in m.message or "no implicit join" in m.message] + assert cross_msgs == [] + + +def test_nested_cd_implicit_join_emits_info(): + """Synthesizable implicit join emits an INFO with the column name.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + infos = _infos(msgs) + assert len(infos) == 1 + assert "implicit join" in infos[0].message + assert "subject_id" in infos[0].message + assert _nested_path_segment(infos[0].path) + + +def test_nested_cd_no_overlap_emits_warning(): + """No shared columns between parent and nested tables emits a WARNING (closes #211).""" + sv = SchemaView(NO_OVERLAP_SCHEMA) + spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "no implicit join" in m.message] + assert len(warnings) == 1 + assert "no columns are shared" in warnings[0].message + assert _nested_path_segment(warnings[0].path) + + +def test_nested_cd_ambiguous_emits_warning(): + """Multiple non-identifier common columns yield an 'ambiguous candidates' WARNING.""" + sv = SchemaView(AMBIGUOUS_SCHEMA) + spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "no implicit join" in m.message] + assert len(warnings) == 1 + assert "multiple candidate" in warnings[0].message + assert "subject_id" in warnings[0].message + assert "sample_id" in warnings[0].message + + +def test_nested_cd_three_level_walks_each_boundary(): + """Cross-table check fires at every parent→child boundary in deeply nested specs.""" + sv = SchemaView(THREE_TABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + "Outer": { + "populated_from": "Outer", + "slot_derivations": { + "mid": { + "class_derivations": [ + { + "Middle": { + "populated_from": "Middle", + "slot_derivations": { + "inner": { + "class_derivations": [ + { + "Inner": { + "populated_from": "Inner", + } + } + ] + }, + }, + } + } + ] + } + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + infos = _infos(msgs) + # Two boundaries: Outer↔Middle (shared_om) and Middle↔Inner (shared_mi). + assert len(infos) == 2 + keys = sorted(m.message for m in infos) + assert any("shared_om" in m for m in keys) + assert any("shared_mi" in m for m in keys) + # Paths reflect the nesting depth. + assert any("Middle" in m.path and "Outer" in m.path for m in infos) + assert any("Inner" in m.path and "Middle" in m.path for m in infos) + + +def test_nested_cd_issue_211_regression(): + """Reproduction of the #211 ARIC spirometry shape. + + Two parent/child tables share a single subject identifier column. The + spec has no explicit joins: block. Validator emits an INFO indicating + the implicit join will synthesize at runtime — no silent null hazard. + """ + schema = textwrap.dedent("""\ + id: https://example.org/aric-211 + name: aric_211 + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: aric_211 + default_range: string + imports: + - linkml:types + + classes: + pht006467: + attributes: + dbGaP_subject_id: + identifier: true + phv00297546: + phv00297568: + range: float + pht012814: + attributes: + dbGaP_subject_id: + phv00512050: + range: integer + phv00512107: + range: float + """) + sv = SchemaView(schema) + spec = normalize_spec_dict( + { + "class_derivations": { + "MeasurementObservationSet": { + "populated_from": "pht006467", + "slot_derivations": { + "observations": { + "class_derivations": [ + { + "MeasurementObservation_inner": { + "populated_from": "pht012814", + "slot_derivations": { + "value_decimal": {"expr": "{pht012814.phv00512107}"}, + }, + } + } + ] + } + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + infos = _infos(msgs) + assert len(infos) == 1 + assert "implicit join" in infos[0].message + assert "dbGaP_subject_id" in infos[0].message + + +def test_nested_cd_expr_attribute_validated_against_joined_class(): + """Expression refs like {joined.field} validate against the joined class's slots (#213 stretch).""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + "Outer": { + "populated_from": "Measurement", + "slot_derivations": { + "observation": {"class_derivations": [{"Inner": {"populated_from": "Reading"}}]}, + "computed": {"expr": "{Reading.score}"}, # valid slot on Reading + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + expr_msgs = [m for m in msgs if "Reading.score" in m.message] + assert expr_msgs == [] + + +def test_nested_cd_expr_attribute_unknown_field_warns(): + """A {joined.bogus} ref on a known joined class warns when the field doesn't exist.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + "Outer": { + "populated_from": "Measurement", + "slot_derivations": { + "observation": {"class_derivations": [{"Inner": {"populated_from": "Reading"}}]}, + "computed": {"expr": "{Reading.bogus_field}"}, + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "Reading.bogus_field" in m.message and "joined class" in m.message] + assert len(warnings) == 1 + + +def test_nested_cd_expr_attribute_explicit_join_alias_validated(): + """{alias.field} on an explicit joins: alias also validates against the joined class.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + "Outer": { + "populated_from": "Measurement", + "joins": { + "readings": {"join_on": "subject_id", "class_named": "Reading"}, + }, + "slot_derivations": { + "computed": {"expr": "{readings.bogus_field}"}, + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "readings.bogus_field" in m.message and "joined class" in m.message] + assert len(warnings) == 1 + + +def test_nested_cd_no_source_schema_no_cross_table_messages(): + """With no source schema, cross-table checks are skipped (can't predict).""" + spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") + msgs = validate_spec_semantics(spec, target_schema=str(PERSONINFO_SRC_SCHEMA)) + cross = [m for m in msgs if "implicit join" in m.message] + assert cross == [] + + +# --------------------------------------------------------------------------- +# is_a / mixins resolution (#219, Option C) +# --------------------------------------------------------------------------- + + +def test_is_a_resolves_via_spec_internal_pool(): + """is_a referencing another top-level class_derivation in the spec is valid.""" + spec = normalize_spec_dict( + { + "class_derivations": { + "Entity": None, + "Agent": {"is_a": "Entity", "populated_from": "Person"}, + } + } + ) + msgs = validate_spec_semantics(spec, source_schema=str(PERSONINFO_SRC_SCHEMA)) + inheritance_errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert inheritance_errors == [] + + +def test_is_a_resolves_via_target_schema(): + """is_a referencing a target schema class name (not in spec pool) is valid.""" + target_schema = textwrap.dedent("""\ + id: https://example.org/agent-target + name: agent_target + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: agent_target + default_range: string + imports: + - linkml:types + classes: + Entity: + description: Target-schema base class + Agent: + is_a: Entity + """) + spec = normalize_spec_dict( + { + "class_derivations": { + "Agent": {"is_a": "Entity", "populated_from": "Person"}, + } + } + ) + target_sv = SchemaView(target_schema) + msgs = validate_spec_semantics(spec, target_schemaview=target_sv) + inheritance_errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert inheritance_errors == [] + + +def test_is_a_unresolved_emits_error(): + """is_a that resolves to neither pool nor target schema is an error (#219).""" + target_schema = textwrap.dedent("""\ + id: https://example.org/empty-target + name: empty_target + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: empty_target + default_range: string + imports: + - linkml:types + classes: + Agent: + """) + spec = normalize_spec_dict( + { + "class_derivations": { + "Agent": {"is_a": "TotallyMadeUpName", "populated_from": "Person"}, + } + } + ) + target_sv = SchemaView(target_schema) + msgs = validate_spec_semantics(spec, target_schemaview=target_sv) + errors = [m for m in _errors(msgs) if "TotallyMadeUpName" in m.message] + assert len(errors) == 1 + assert "is_a" in errors[0].message + assert "does not resolve" in errors[0].message + + +def test_mixins_each_entry_resolved_independently(): + """mixins is a list; each entry resolves independently against pool + target.""" + target_schema = textwrap.dedent("""\ + id: https://example.org/mixins-target + name: mixins_target + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: mixins_target + default_range: string + imports: + - linkml:types + classes: + NamedThing: + Agent: + """) + spec = normalize_spec_dict( + { + "class_derivations": { + "InternalMixin": None, + "Agent": { + "mixins": ["InternalMixin", "NamedThing", "BogusMixin"], + "populated_from": "Person", + }, + } + } + ) + target_sv = SchemaView(target_schema) + msgs = validate_spec_semantics(spec, target_schemaview=target_sv) + errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert len(errors) == 1 + assert "BogusMixin" in errors[0].message + assert "mixins" in errors[0].message + + +def test_is_a_and_mixins_both_validated(): + """Both is_a and mixins emit errors for unresolved references on the same CD.""" + target_schema = textwrap.dedent("""\ + id: https://example.org/dual + name: dual + prefixes: + linkml: https://w3id.org/linkml/ + default_prefix: dual + default_range: string + imports: + - linkml:types + classes: + Agent: + """) + spec = normalize_spec_dict( + { + "class_derivations": { + "Agent": { + "is_a": "MissingParent", + "mixins": ["MissingMixin"], + "populated_from": "Person", + } + } + } + ) + target_sv = SchemaView(target_schema) + msgs = validate_spec_semantics(spec, target_schemaview=target_sv) + errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert len(errors) == 2 + field_labels = {("is_a" if "is_a" in e.message else "mixins") for e in errors} + assert field_labels == {"is_a", "mixins"} + + +def test_personinfo_to_agent_fixture_inheritance_resolves(): + """The personinfo-to-agent fixture's Agent.is_a: Entity must resolve via pool-1.""" + from pathlib import Path + + fixture = Path(EXAMPLE_DIR) / "personinfo_basic" / "transform" / "personinfo-to-agent.transform.yaml" + msgs = validate_spec_file(fixture) + inheritance_errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert inheritance_errors == [] From 7183ca9393e7b2ce171ebff0b3e3f095b5ddbdf6 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 11:48:04 -0500 Subject: [PATCH 2/7] Address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Cross-table join: when an explicit joins: entry covers the nested source, also verify the spec actually has 'join_on' or both 'source_key' and 'lookup_key'. An empty entry (structurally valid but missing keys) now warns at validate-spec time instead of failing at transform time with ValueError. - is_a / mixins: skip the resolution check entirely when target_sv is None. The "not in pool" half is genuinely ambiguous without the target schema — the reference could still resolve there. Erroring on half-checked pools was a false-positive risk. - Joined-class error wording: include both the resolved class name and the alias when they differ ('joined class X (alias Y)') so users can tell which schema class was actually checked when class_named is set. - _build_joined_class_map now returns dict[alias, (class_name, slot_set | None)] so the consumer can surface the resolved class name in error messages. Added five new regression tests covering each scenario. --- src/linkml_map/validator.py | 82 +++++++++++++++++++++++--------- tests/test_validator.py | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 21 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index bb892427..f467d6aa 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -689,8 +689,8 @@ def _build_joined_class_map( cd: dict[str, Any], source_sv: SchemaView | None, slot_derivation_dicts: list[dict[str, Any]], -) -> dict[str, set[str] | None]: - """Map alias names to the slot set of their joined class. +) -> dict[str, tuple[str, set[str] | None]]: + """Map alias names to the resolved joined-class name and its slot set. Combines two sources of "joined tables" visible from this CD's expressions: @@ -705,11 +705,14 @@ def _build_joined_class_map( :param source_sv: Source schema view, or ``None``. :param slot_derivation_dicts: Pre-iterated slot derivations (avoids re-walking). - :returns: Mapping of alias name to its joined class's slot set. Value - is ``None`` when the joined class can't be resolved in the source - schema (or when no source schema is available). + :returns: Mapping of alias name to a ``(joined_class_name, slot_set)`` + tuple. ``slot_set`` is ``None`` when the joined class can't be + resolved in the source schema (or when no source schema is + available). The class name is tracked separately so error + messages can distinguish alias from class for explicit + ``class_named`` joins. """ - result: dict[str, set[str] | None] = {} + result: dict[str, tuple[str, set[str] | None]] = {} joins = cd.get("joins") or {} if isinstance(joins, dict): @@ -720,9 +723,10 @@ def _build_joined_class_map( else: joined_class = alias if source_sv is not None and joined_class in set(source_sv.all_classes()): - result[alias] = {s.name for s in source_sv.class_induced_slots(joined_class)} + slots = {s.name for s in source_sv.class_induced_slots(joined_class)} + result[alias] = (joined_class, slots) else: - result[alias] = None + result[alias] = (joined_class, None) parent_source = cd.get("populated_from") if source_sv is not None and parent_source: @@ -732,9 +736,10 @@ def _build_joined_class_map( nested_source = nested.get("populated_from") if nested_source and nested_source != parent_source and nested_source not in result: if nested_source in all_classes: - result[nested_source] = {s.name for s in source_sv.class_induced_slots(nested_source)} + slots = {s.name for s in source_sv.class_induced_slots(nested_source)} + result[nested_source] = (nested_source, slots) else: - result[nested_source] = None + result[nested_source] = (nested_source, None) return result @@ -764,6 +769,29 @@ def _check_cross_table_join( parent_joins = parent_cd.get("joins") or {} if isinstance(parent_joins, dict) and nested_source in parent_joins: + # Explicit join is present — verify the spec carries enough keys to + # actually resolve a row, mirroring the runtime check in + # ``_resolve_joined_row``. A structurally-valid-but-empty entry like + # ``joins: {Reading: {}}`` would otherwise pass validation but blow + # up at transform time with ``ValueError: Join spec ... must + # specify 'join_on' or both 'source_key' and 'lookup_key'``. + spec = parent_joins[nested_source] + spec_dict = spec if isinstance(spec, dict) else {} + join_on = spec_dict.get("join_on") + source_key = spec_dict.get("source_key") + lookup_key = spec_dict.get("lookup_key") + if not join_on and not (source_key and lookup_key): + messages.append( + ValidationMessage( + severity="warning", + path=nested_path, + message=( + f"Join spec for '{nested_source}' is missing keys: " + f"must specify 'join_on' or both 'source_key' and " + f"'lookup_key'. Runtime will raise ValueError." + ), + ) + ) return if source_sv is None: @@ -831,7 +859,11 @@ def _check_class_inheritance_refs( 2. A class in the target schema (matches LinkML's ``is_a`` convention on plain schemas). - A reference that resolves to neither emits an ``error``. + A reference that resolves to neither emits an ``error`` — but only when + ``target_sv`` is available. Without the target schema, half the pool is + unknown, so a miss against the spec pool alone is ambiguous (it could + still be a valid target schema reference) and is left unchecked rather + than risk a false positive. """ parents: list[tuple[str, str]] = [] is_a = cd.get("is_a") @@ -841,10 +873,10 @@ def _check_class_inheritance_refs( if isinstance(mixins, list): parents.extend(("mixins", m) for m in mixins if isinstance(m, str)) - if not parents: + if not parents or target_sv is None: return - target_classes = set(target_sv.all_classes()) if target_sv is not None else set() + target_classes = set(target_sv.all_classes()) for field_label, parent_name in parents: if parent_name in derivation_pool: @@ -869,18 +901,19 @@ def _validate_slot_derivation( parent_path: str, source_class_slots: set[str] | None, target_class_slots: set[str] | None, - joined_class_slots: dict[str, set[str] | None], + joined_class_slots: dict[str, tuple[str, set[str] | None]], strict: bool, messages: list[ValidationMessage], ) -> None: """Validate a single slot derivation against schemas. :param joined_class_slots: Mapping of alias name (from explicit - ``joins:`` or an implicit-join target) to the slot set of the - joined class. Expression bare-name references to these aliases - are excluded from source-class checks, and attribute references - like ``{alias.field}`` are validated against the joined class's - slots. Value of ``None`` means the joined class is unknown — the + ``joins:`` or an implicit-join target) to a tuple of + ``(joined_class_name, slot_set)``. Expression bare-name + references to these aliases are excluded from source-class + checks, and attribute references like ``{alias.field}`` are + validated against the joined class's slot set. A ``slot_set`` + of ``None`` means the joined class couldn't be resolved — the bare alias is still tolerated, but attribute access is skipped. """ sd_name = sd.get("name", "?") @@ -933,9 +966,16 @@ def _validate_slot_derivation( for base, attrs in attr_refs.items(): if base not in joined_class_slots: continue - joined_slots = joined_class_slots[base] + joined_class, joined_slots = joined_class_slots[base] if joined_slots is None: continue + # When the alias differs from the resolved class name (explicit + # ``class_named``), surface both so the user knows which schema + # class was actually checked. + if joined_class == base: + class_descriptor = f"joined class '{joined_class}'" + else: + class_descriptor = f"joined class '{joined_class}' (alias '{base}')" for attr in sorted(attrs): if attr not in joined_slots: messages.append( @@ -943,7 +983,7 @@ def _validate_slot_derivation( severity="error" if strict else "warning", path=sd_path, message=( - f"Expression references '{base}.{attr}' but '{attr}' is not a slot on joined class '{base}'" + f"Expression references '{base}.{attr}' but '{attr}' is not a slot on {class_descriptor}" ), ) ) diff --git a/tests/test_validator.py b/tests/test_validator.py index f184e246..da744988 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1216,3 +1216,97 @@ def test_personinfo_to_agent_fixture_inheritance_resolves(): msgs = validate_spec_file(fixture) inheritance_errors = [m for m in _errors(msgs) if "does not resolve" in m.message] assert inheritance_errors == [] + + +def test_is_a_unresolved_skipped_when_no_target_schema(): + """Without target_sv, an is_a miss against the spec pool alone is not an error. + + The reference might be a valid target-schema class name; we can't tell + without the target schema, so we don't error on a half-checked pool. + """ + spec = normalize_spec_dict( + { + "class_derivations": { + "Agent": {"is_a": "TotallyMadeUpName", "populated_from": "Person"}, + } + } + ) + msgs = validate_spec_semantics(spec, source_schema=str(PERSONINFO_SRC_SCHEMA)) + inheritance_errors = [m for m in _errors(msgs) if "does not resolve" in m.message] + assert inheritance_errors == [] + + +# --------------------------------------------------------------------------- +# Incomplete-join-spec detection on cross-table boundaries +# --------------------------------------------------------------------------- + + +def test_cross_table_empty_explicit_join_emits_warning(): + """An explicit joins: entry that's empty fails at runtime — surface it here.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec( + outer_pf="Measurement", + inner_pf="Reading", + joins={"Reading": {}}, + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "missing keys" in m.message] + assert len(warnings) == 1 + assert "Reading" in warnings[0].message + + +def test_cross_table_source_lookup_keys_satisfy_join(): + """A join spec using source_key + lookup_key (not join_on) is valid.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec( + outer_pf="Measurement", + inner_pf="Reading", + joins={"Reading": {"source_key": "subject_id", "lookup_key": "subject_id"}}, + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + incomplete = [m for m in _warnings(msgs) if "missing keys" in m.message] + assert incomplete == [] + + +def test_cross_table_source_key_only_emits_warning(): + """source_key without lookup_key (and no join_on) fails — both keys required.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec( + outer_pf="Measurement", + inner_pf="Reading", + joins={"Reading": {"source_key": "subject_id"}}, + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "missing keys" in m.message] + assert len(warnings) == 1 + + +# --------------------------------------------------------------------------- +# Joined-class error message includes resolved class name when aliased +# --------------------------------------------------------------------------- + + +def test_joined_class_attr_error_uses_class_named_when_aliased(): + """Error message names the actual joined class, not just the alias, when class_named is set.""" + sv = SchemaView(JOINABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + "Outer": { + "populated_from": "Measurement", + "joins": { + "readings": {"join_on": "subject_id", "class_named": "Reading"}, + }, + "slot_derivations": { + "computed": {"expr": "{readings.bogus_field}"}, + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + warnings = [m for m in _warnings(msgs) if "readings.bogus_field" in m.message] + assert len(warnings) == 1 + msg = warnings[0].message + assert "joined class 'Reading'" in msg + assert "alias 'readings'" in msg From d4a7ca2a2a22bb93ca26a63e402152065baf6f84 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 11:53:54 -0500 Subject: [PATCH 3/7] Warn on column-overlap hazard for implicit cross-table joins (#217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an implicit join would synthesize at runtime (#212), additionally check whether parent and child share non-join columns. Runtime behavior on those columns is data-dependent (#217 Bug 2): rows with a join match merge and mark shared columns _AMBIGUOUS (unqualified access errors), rows without a match leave the bare parent row in place and silently return the parent's value. Same spec, same dataset, can both error and silently succeed depending on which row is being processed. The hazard is fully determined by the schemas — surface it at validate-spec time instead of waiting for a sparse row to expose it. This addresses the static half of #217 Bug 2. The runtime behavior itself (consistent _AMBIGUOUS insertion regardless of join hit, and nested-object shape on no-match) remains a follow-up. --- src/linkml_map/validator.py | 26 ++++++++++++++++++++++ tests/test_validator.py | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index f467d6aa..0f7a1040 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -818,6 +818,32 @@ def _check_cross_table_join( ), ) ) + # Surface the column-overlap hazard for #217 Bug 2 (data-dependent + # ambiguity). When parent and child share non-join columns, runtime + # behavior depends on whether the join lookup hits: a match merges + # rows and marks shared columns _AMBIGUOUS (unqualified access + # errors), but a miss leaves the bare parent row in place and + # silently returns the parent's value for that column. Flag it + # statically so users disambiguate before sparse data exposes it. + parent_slots = {s.name for s in source_sv.class_induced_slots(parent_source)} + nested_slots = {s.name for s in source_sv.class_induced_slots(nested_source)} + overlap = (parent_slots & nested_slots) - {key} + if overlap: + shared = ", ".join(f"'{c}'" for c in sorted(overlap)) + messages.append( + ValidationMessage( + severity="warning", + path=nested_path, + message=( + f"Implicit join between '{parent_source}' and '{nested_source}' " + f"on '{key}' — overlapping non-join columns {{{shared}}} will " + f"resolve ambiguously: error on rows with a join match " + f"(_AMBIGUOUS sentinel), silent parent value on rows without " + f"(#217). Disambiguate with an explicit joins: alias or " + f"dotted-reference form." + ), + ) + ) return common = find_common_columns(source_sv, parent_source, nested_source) diff --git a/tests/test_validator.py b/tests/test_validator.py index da744988..22772010 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -856,6 +856,50 @@ def test_nested_cd_implicit_join_emits_info(): assert _nested_path_segment(infos[0].path) +def test_implicit_join_overlap_columns_emit_ambiguity_warning(): + """Non-join column overlap between parent and child surfaces #217 hazard. + + JOINABLE_SCHEMA: Measurement(id, subject_id, method) and Reading(id, + subject_id, score). Implicit join on 'subject_id' leaves 'id' as a + non-join overlap — runtime ambiguity behavior is data-dependent + (#217 Bug 2), so warn statically. + """ + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] + assert len(overlap_warnings) == 1 + assert "'id'" in overlap_warnings[0].message + assert "ambiguously" in overlap_warnings[0].message + + +def test_implicit_join_no_overlap_no_ambiguity_warning(): + """When parent and child share only the join column, no #217 warning fires.""" + sv = SchemaView(THREE_TABLE_SCHEMA) + # Outer + Middle share only 'shared_om' (which is the join key). + spec = _nested_spec(outer_pf="Outer", inner_pf="Middle") + msgs = validate_spec_semantics(spec, source_schemaview=sv) + overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] + assert overlap_warnings == [] + + +def test_explicit_join_skips_overlap_warning(): + """An explicit joins: block opts out of the implicit-join ambiguity check. + + Users who declared the join explicitly have signaled intent; assume they + will disambiguate column refs themselves. + """ + sv = SchemaView(JOINABLE_SCHEMA) + spec = _nested_spec( + outer_pf="Measurement", + inner_pf="Reading", + joins={"Reading": {"join_on": "subject_id"}}, + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] + assert overlap_warnings == [] + + def test_nested_cd_no_overlap_emits_warning(): """No shared columns between parent and nested tables emits a WARNING (closes #211).""" sv = SchemaView(NO_OVERLAP_SCHEMA) From c64299c453cd43fcd921db46fd2a67847b3991cc Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 12:44:15 -0500 Subject: [PATCH 4/7] Precompute all_classes once in _build_joined_class_map --- src/linkml_map/validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index 0f7a1040..d299259d 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -713,6 +713,7 @@ def _build_joined_class_map( ``class_named`` joins. """ result: dict[str, tuple[str, set[str] | None]] = {} + all_classes = set(source_sv.all_classes()) if source_sv is not None else set() joins = cd.get("joins") or {} if isinstance(joins, dict): @@ -722,7 +723,7 @@ def _build_joined_class_map( joined_class = spec.get("class_named") or alias else: joined_class = alias - if source_sv is not None and joined_class in set(source_sv.all_classes()): + if source_sv is not None and joined_class in all_classes: slots = {s.name for s in source_sv.class_induced_slots(joined_class)} result[alias] = (joined_class, slots) else: @@ -730,7 +731,6 @@ def _build_joined_class_map( parent_source = cd.get("populated_from") if source_sv is not None and parent_source: - all_classes = set(source_sv.all_classes()) for sd in slot_derivation_dicts: for nested in _iter_derivation_dicts(sd.get("class_derivations", [])): nested_source = nested.get("populated_from") From 481d4617c45c866e8f34b1cc5db5e408ea8b9276 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 13:46:10 -0500 Subject: [PATCH 5/7] =?UTF-8?q?Revert=20#217=20column-overlap=20warning=20?= =?UTF-8?q?=E2=80=94=20unreachable=20in=20practice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overlap warning added in d4a7ca2 was unreachable given how pick_join_key works: it returns a non-None key only when exactly one non-identifier column is common (or when only identifier columns are common, as a fallback). Any remaining overlap after subtracting the join key is therefore identifier-only. Filtering identifiers (the only sensible refinement) would make the check fire on zero realistic inputs. The genuine "two non-identifier columns in common, one is the join key, the other is a real data column that resolves ambiguously" hazard is already covered by the existing 'multiple candidate join columns' WARNING — pick_join_key refuses to pick in that scenario, so we fall through to the ambiguous-candidates path, not the implicit-join INFO branch. Dropping the warning and the three tests rather than carrying dead code on speculation that pick_join_key might one day get smarter. --- src/linkml_map/validator.py | 26 ---------------------- tests/test_validator.py | 44 ------------------------------------- 2 files changed, 70 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index d299259d..ff418338 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -818,32 +818,6 @@ def _check_cross_table_join( ), ) ) - # Surface the column-overlap hazard for #217 Bug 2 (data-dependent - # ambiguity). When parent and child share non-join columns, runtime - # behavior depends on whether the join lookup hits: a match merges - # rows and marks shared columns _AMBIGUOUS (unqualified access - # errors), but a miss leaves the bare parent row in place and - # silently returns the parent's value for that column. Flag it - # statically so users disambiguate before sparse data exposes it. - parent_slots = {s.name for s in source_sv.class_induced_slots(parent_source)} - nested_slots = {s.name for s in source_sv.class_induced_slots(nested_source)} - overlap = (parent_slots & nested_slots) - {key} - if overlap: - shared = ", ".join(f"'{c}'" for c in sorted(overlap)) - messages.append( - ValidationMessage( - severity="warning", - path=nested_path, - message=( - f"Implicit join between '{parent_source}' and '{nested_source}' " - f"on '{key}' — overlapping non-join columns {{{shared}}} will " - f"resolve ambiguously: error on rows with a join match " - f"(_AMBIGUOUS sentinel), silent parent value on rows without " - f"(#217). Disambiguate with an explicit joins: alias or " - f"dotted-reference form." - ), - ) - ) return common = find_common_columns(source_sv, parent_source, nested_source) diff --git a/tests/test_validator.py b/tests/test_validator.py index 22772010..da744988 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -856,50 +856,6 @@ def test_nested_cd_implicit_join_emits_info(): assert _nested_path_segment(infos[0].path) -def test_implicit_join_overlap_columns_emit_ambiguity_warning(): - """Non-join column overlap between parent and child surfaces #217 hazard. - - JOINABLE_SCHEMA: Measurement(id, subject_id, method) and Reading(id, - subject_id, score). Implicit join on 'subject_id' leaves 'id' as a - non-join overlap — runtime ambiguity behavior is data-dependent - (#217 Bug 2), so warn statically. - """ - sv = SchemaView(JOINABLE_SCHEMA) - spec = _nested_spec(outer_pf="Measurement", inner_pf="Reading") - msgs = validate_spec_semantics(spec, source_schemaview=sv) - overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] - assert len(overlap_warnings) == 1 - assert "'id'" in overlap_warnings[0].message - assert "ambiguously" in overlap_warnings[0].message - - -def test_implicit_join_no_overlap_no_ambiguity_warning(): - """When parent and child share only the join column, no #217 warning fires.""" - sv = SchemaView(THREE_TABLE_SCHEMA) - # Outer + Middle share only 'shared_om' (which is the join key). - spec = _nested_spec(outer_pf="Outer", inner_pf="Middle") - msgs = validate_spec_semantics(spec, source_schemaview=sv) - overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] - assert overlap_warnings == [] - - -def test_explicit_join_skips_overlap_warning(): - """An explicit joins: block opts out of the implicit-join ambiguity check. - - Users who declared the join explicitly have signaled intent; assume they - will disambiguate column refs themselves. - """ - sv = SchemaView(JOINABLE_SCHEMA) - spec = _nested_spec( - outer_pf="Measurement", - inner_pf="Reading", - joins={"Reading": {"join_on": "subject_id"}}, - ) - msgs = validate_spec_semantics(spec, source_schemaview=sv) - overlap_warnings = [w for w in _warnings(msgs) if "#217" in w.message] - assert overlap_warnings == [] - - def test_nested_cd_no_overlap_emits_warning(): """No shared columns between parent and nested tables emits a WARNING (closes #211).""" sv = SchemaView(NO_OVERLAP_SCHEMA) From 18938d4d5fa760df00fab7a2a790379c600c4256 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Tue, 12 May 2026 14:14:49 -0500 Subject: [PATCH 6/7] Clarify cross-table diagnostic wording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both messages referred to "joins: block" wholesale, but the path fires whenever a specific entry for nested_source is absent — the parent CD may already have a joins: mapping for other tables. Reword to point at the specific missing entry instead of implying the whole block needs adding. --- src/linkml_map/validator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index ff418338..4885e85a 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -812,9 +812,9 @@ def _check_cross_table_join( path=nested_path, message=( f"Nested 'populated_from={nested_source}' differs from parent " - f"'populated_from={parent_source}'. No explicit joins: block; " - f"implicit join will be synthesized on column '{key}'. " - f"Consider declaring the join explicitly." + f"'populated_from={parent_source}'. No explicit join entry for " + f"'{nested_source}'; implicit join will be synthesized on column " + f"'{key}'. Consider declaring the join explicitly." ), ) ) @@ -836,8 +836,8 @@ def _check_cross_table_join( message=( f"Nested 'populated_from={nested_source}' differs from parent " f"'populated_from={parent_source}', but no implicit join can be " - f"synthesized: {reason}. Add an explicit joins: block — cross-table " - f"values will otherwise resolve to null." + f"synthesized: {reason}. Add an explicit join entry for " + f"'{nested_source}' — cross-table values will otherwise resolve to null." ), ) ) From bd6c92df91552724eb823b481405b688ceb2c44d Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Wed, 13 May 2026 15:55:20 -0500 Subject: [PATCH 7/7] Match runtime's identity-case parent source _derive_nested_objects in the runtime uses ``parent_class_deriv.populated_from or parent_class_deriv.name``, so a CD that omits ``populated_from`` falls back to its own name as the source class (the "identity" case). _build_joined_class_map and _check_cross_table_join previously only consulted ``populated_from``, so an identity-style outer CD with a nested cross-table reference silently slipped past validation. Mirror the runtime fallback in both places and add a regression test. --- src/linkml_map/validator.py | 9 +++++++-- tests/test_validator.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/linkml_map/validator.py b/src/linkml_map/validator.py index 4885e85a..ba0ddd9e 100644 --- a/src/linkml_map/validator.py +++ b/src/linkml_map/validator.py @@ -729,7 +729,10 @@ def _build_joined_class_map( else: result[alias] = (joined_class, None) - parent_source = cd.get("populated_from") + # Identity case: when populated_from is omitted, the runtime treats the + # CD's own name as the parent source. Match that behavior here so nested + # CDs reachable from an identity-CD still get cross-table validation. + parent_source = cd.get("populated_from") or cd.get("name") if source_sv is not None and parent_source: for sd in slot_derivation_dicts: for nested in _iter_derivation_dicts(sd.get("class_derivations", [])): @@ -763,7 +766,9 @@ def _check_cross_table_join( with the same diagnostic the runtime would raise. """ nested_source = nested_cd.get("populated_from") - parent_source = parent_cd.get("populated_from") + # Identity case for the parent: fall back to its name when populated_from + # is omitted (matches _derive_nested_objects in the runtime). + parent_source = parent_cd.get("populated_from") or parent_cd.get("name") if not nested_source or not parent_source or nested_source == parent_source: return diff --git a/tests/test_validator.py b/tests/test_validator.py index da744988..ae41c89c 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1062,6 +1062,40 @@ def test_nested_cd_no_source_schema_no_cross_table_messages(): assert cross == [] +def test_identity_case_parent_cd_validated_against_runtime(): + """A parent CD with no populated_from uses its name as the source class. + + Runtime falls back to ``parent_class_deriv.populated_from or + parent_class_deriv.name`` (see ObjectTransformer._derive_nested_objects), + so the validator must too — otherwise nested cross-table refs from an + identity-style outer CD silently pass. + """ + sv = SchemaView(JOINABLE_SCHEMA) + spec = normalize_spec_dict( + { + "class_derivations": { + # Outer "Measurement" CD — no populated_from, identity case. + "Measurement": { + "slot_derivations": { + "observation": { + "class_derivations": [{"Inner": {"populated_from": "Reading"}}], + }, + "computed": {"expr": "{Reading.bogus_field}"}, + }, + } + } + } + ) + msgs = validate_spec_semantics(spec, source_schemaview=sv) + # Cross-table check should fire (Measurement → Reading is a real boundary). + infos = [m for m in _infos(msgs) if "implicit join" in m.message] + assert len(infos) == 1 + assert "subject_id" in infos[0].message + # Attribute ref against Reading should be validated — bogus_field caught. + attr_warnings = [m for m in _warnings(msgs) if "Reading.bogus_field" in m.message] + assert len(attr_warnings) == 1 + + # --------------------------------------------------------------------------- # is_a / mixins resolution (#219, Option C) # ---------------------------------------------------------------------------