Add NMDC Biosample-Study cross-table join tests

turbomam · web-flow · commit 8a5559a3c19a · 2026-03-10T16:21:46.000-05:00
diff --git a/tests/test_transformer/test_nmdc_biosample_study_join.py b/tests/test_transformer/test_nmdc_biosample_study_join.py
@@ -0,0 +1,272 @@
+"""Tests for cross-table Biosample-Study denormalization using joins.
+
+Demonstrates the ``joins:`` feature from PR #136: Biosample rows are
+enriched with Study metadata (PI name, ecosystem, study name) by joining
+on ``associated_studies``. This is the pattern needed for NMDC lakehouse
+tables where biosample rows carry denormalized study context.
+
+This test requires the cross-table join feature introduced in PR #136.
+
+See also:
+- PR #136 (cross-table lookup support)
+- Issue #134 (cross-table slot lookup design)
+"""
+
+# ruff: noqa: PLR2004
+
+import textwrap
+
+import yaml
+from linkml_runtime import SchemaView
+
+from linkml_map.loaders.data_loaders import DataLoader
+from linkml_map.transformer.engine import transform_spec
+from linkml_map.transformer.object_transformer import ObjectTransformer
+
+# ---------------------------------------------------------------------------
+# Schemas
+# ---------------------------------------------------------------------------
+
+SOURCE_SCHEMA_YAML = textwrap.dedent("""\
+    id: https://example.org/nmdc-join-source
+    name: nmdc_join_source
+    prefixes:
+      linkml: https://w3id.org/linkml/
+    imports:
+      - linkml:types
+    default_range: string
+    classes:
+      Biosample:
+        attributes:
+          id:
+            identifier: true
+          name: {}
+          associated_studies: {}
+          depth_value: {}
+          depth_unit: {}
+          env_broad_scale_term_id: {}
+          ecosystem: {}
+          ecosystem_category: {}
+      Study:
+        attributes:
+          id:
+            identifier: true
+          name: {}
+          pi_name: {}
+          pi_email: {}
+          ecosystem: {}
+          ecosystem_category: {}
+          funding_sources: {}
+""")
+
+TARGET_SCHEMA_YAML = textwrap.dedent("""\
+    id: https://example.org/nmdc-join-target
+    name: nmdc_join_target
+    prefixes:
+      linkml: https://w3id.org/linkml/
+    imports:
+      - linkml:types
+    default_range: string
+    classes:
+      DenormalizedBiosample:
+        attributes:
+          biosample_id:
+            identifier: true
+          biosample_name: {}
+          depth_value: {}
+          depth_unit: {}
+          env_broad_scale_term_id: {}
+          ecosystem: {}
+          ecosystem_category: {}
+          study_id: {}
+          study_name: {}
+          pi_name: {}
+          pi_email: {}
+          study_ecosystem: {}
+          funding_sources: {}
+""")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_transformer(source_sv, target_sv, spec_yaml):
+    tr = ObjectTransformer(unrestricted_eval=False)
+    tr.source_schemaview = source_sv
+    tr.target_schemaview = target_sv
+    tr.create_transformer_specification(yaml.safe_load(spec_yaml))
+    return tr
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_biosample_study_join(tmp_path):
+    """Biosample rows are enriched with Study metadata via join on associated_studies."""
+    (tmp_path / "Biosample.tsv").write_text(
+        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
+        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
+        "nmdc:bsm-12-abc\tHopland soil\tnmdc:sty-11-001\t0.75\tm\t"
+        "ENVO:01000174\tEnvironmental\tTerrestrial\n"
+        "nmdc:bsm-12-def\tStream sediment\tnmdc:sty-11-002\t1.5\tm\t"
+        "ENVO:00002030\tEnvironmental\tAquatic\n"
+        "nmdc:bsm-12-ghi\tOrphan sample\tnmdc:sty-11-999\t0.1\tm\t"
+        "ENVO:01000174\tEnvironmental\tTerrestrial\n"
+    )
+    (tmp_path / "Study.tsv").write_text(
+        "id\tname\tpi_name\tpi_email\tecosystem\t"
+        "ecosystem_category\tfunding_sources\n"
+        "nmdc:sty-11-001\tHopland Metagenome\tElaine Faustman\t"
+        "faustman@example.org\tEnvironmental\tTerrestrial\tDOE BER\n"
+        "nmdc:sty-11-002\tStream Ecology\tJane Doe\t"
+        "jdoe@example.org\tEnvironmental\tAquatic\tNSF\n"
+    )
+
+    spec = textwrap.dedent("""\
+        class_derivations:
+          DenormalizedBiosample:
+            populated_from: Biosample
+            joins:
+              Study:
+                source_key: associated_studies
+                lookup_key: id
+            slot_derivations:
+              biosample_id:
+                populated_from: id
+              biosample_name:
+                populated_from: name
+              depth_value:
+                populated_from: depth_value
+              depth_unit:
+                populated_from: depth_unit
+              env_broad_scale_term_id:
+                populated_from: env_broad_scale_term_id
+              ecosystem:
+                populated_from: ecosystem
+              ecosystem_category:
+                populated_from: ecosystem_category
+              study_id:
+                expr: "{Study.id}"
+              study_name:
+                expr: "{Study.name}"
+              pi_name:
+                expr: "{Study.pi_name}"
+              pi_email:
+                expr: "{Study.pi_email}"
+              study_ecosystem:
+                expr: "{Study.ecosystem}"
+              funding_sources:
+                expr: "{Study.funding_sources}"
+    """)
+
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(TARGET_SCHEMA_YAML)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)
+
+    results = list(transform_spec(tr, loader))
+    assert len(results) == 3
+
+    # Biosample 1: joined to Study 001
+    r0 = results[0]
+    assert r0["biosample_id"] == "nmdc:bsm-12-abc"
+    assert r0["biosample_name"] == "Hopland soil"
+    assert str(r0["depth_value"]) == "0.75"
+    assert r0["depth_unit"] == "m"
+    assert r0["study_id"] == "nmdc:sty-11-001"
+    assert r0["study_name"] == "Hopland Metagenome"
+    assert r0["pi_name"] == "Elaine Faustman"
+    assert r0["pi_email"] == "faustman@example.org"
+    assert r0["funding_sources"] == "DOE BER"
+
+    # Biosample 2: joined to Study 002
+    r1 = results[1]
+    assert r1["biosample_id"] == "nmdc:bsm-12-def"
+    assert r1["study_id"] == "nmdc:sty-11-002"
+    assert r1["study_name"] == "Stream Ecology"
+    assert r1["pi_name"] == "Jane Doe"
+
+    # Biosample 3: orphan — Study 999 doesn't exist → null propagation
+    r2 = results[2]
+    assert r2["biosample_id"] == "nmdc:bsm-12-ghi"
+    assert r2["biosample_name"] == "Orphan sample"
+    assert r2.get("study_id") is None
+    assert r2.get("study_name") is None
+    assert r2.get("pi_name") is None
+
+
+def test_biosample_study_join_preserves_biosample_fields(tmp_path):
+    """All biosample-native fields pass through unchanged when joins are present."""
+    (tmp_path / "Biosample.tsv").write_text(
+        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
+        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
+        "nmdc:bsm-12-xyz\tTest sample\tnmdc:sty-11-001\t2.0\tft\t"
+        "ENVO:00002030\tHost-associated\tHuman\n"
+    )
+    (tmp_path / "Study.tsv").write_text(
+        "id\tname\tpi_name\tpi_email\tecosystem\t"
+        "ecosystem_category\tfunding_sources\n"
+        "nmdc:sty-11-001\tTest Study\tTest PI\t"
+        "test@example.org\tEnvironmental\tTerrestrial\tDOE\n"
+    )
+
+    spec = textwrap.dedent("""\
+        class_derivations:
+          DenormalizedBiosample:
+            populated_from: Biosample
+            joins:
+              Study:
+                source_key: associated_studies
+                lookup_key: id
+            slot_derivations:
+              biosample_id:
+                populated_from: id
+              biosample_name:
+                populated_from: name
+              depth_value:
+                populated_from: depth_value
+              depth_unit:
+                populated_from: depth_unit
+              env_broad_scale_term_id:
+                populated_from: env_broad_scale_term_id
+              ecosystem:
+                populated_from: ecosystem
+              ecosystem_category:
+                populated_from: ecosystem_category
+              study_id:
+                expr: "{Study.id}"
+              study_name:
+                expr: "{Study.name}"
+              pi_name:
+                expr: "{Study.pi_name}"
+              pi_email:
+                expr: "{Study.pi_email}"
+              study_ecosystem:
+                expr: "{Study.ecosystem}"
+              funding_sources:
+                expr: "{Study.funding_sources}"
+    """)
+
+    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
+    target_sv = SchemaView(TARGET_SCHEMA_YAML)
+    tr = _make_transformer(source_sv, target_sv, spec)
+    loader = DataLoader(tmp_path)
+
+    results = list(transform_spec(tr, loader))
+    assert len(results) == 1
+
+    r = results[0]
+    # All biosample fields preserved
+    assert r["biosample_id"] == "nmdc:bsm-12-xyz"
+    assert r["biosample_name"] == "Test sample"
+    assert str(r["depth_value"]) == "2.0"
+    assert r["depth_unit"] == "ft"
+    assert r["env_broad_scale_term_id"] == "ENVO:00002030"
+    assert r["ecosystem"] == "Host-associated"
+    assert r["ecosystem_category"] == "Human"
+    # Study fields joined
+    assert r["study_id"] == "nmdc:sty-11-001"
+    assert r["pi_name"] == "Test PI"