Skip to content

Commit 8a5559a

Browse files
authored
Add NMDC Biosample-Study cross-table join tests
1 parent eb19c49 commit 8a5559a

1 file changed

Lines changed: 272 additions & 0 deletions

File tree

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
"""Tests for cross-table Biosample-Study denormalization using joins.
2+
3+
Demonstrates the ``joins:`` feature from PR #136: Biosample rows are
4+
enriched with Study metadata (PI name, ecosystem, study name) by joining
5+
on ``associated_studies``. This is the pattern needed for NMDC lakehouse
6+
tables where biosample rows carry denormalized study context.
7+
8+
These tests require the cross-table join feature introduced in PR #136.
9+
10+
See also:
11+
- PR #136 (cross-table lookup support)
12+
- Issue #134 (cross-table slot lookup design)
13+
"""
14+
15+
# ruff: noqa: PLR2004
16+
17+
import textwrap
18+
19+
import yaml
20+
from linkml_runtime import SchemaView
21+
22+
from linkml_map.loaders.data_loaders import DataLoader
23+
from linkml_map.transformer.engine import transform_spec
24+
from linkml_map.transformer.object_transformer import ObjectTransformer
25+
26+
# ---------------------------------------------------------------------------
27+
# Schemas
28+
# ---------------------------------------------------------------------------
29+
30+
# Source-side LinkML schema: a Biosample class and a Study class, each keyed
# by ``id``, with every other attribute falling back to the string default range.
SOURCE_SCHEMA_YAML = textwrap.dedent(
    """\
    id: https://example.org/nmdc-join-source
    name: nmdc_join_source
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      Biosample:
        attributes:
          id:
            identifier: true
          name: {}
          associated_studies: {}
          depth_value: {}
          depth_unit: {}
          env_broad_scale_term_id: {}
          ecosystem: {}
          ecosystem_category: {}
      Study:
        attributes:
          id:
            identifier: true
          name: {}
          pi_name: {}
          pi_email: {}
          ecosystem: {}
          ecosystem_category: {}
          funding_sources: {}
    """
)
61+
62+
# Target-side LinkML schema: a single flat DenormalizedBiosample class keyed by
# ``biosample_id`` that carries both biosample slots and study-derived slots.
TARGET_SCHEMA_YAML = textwrap.dedent(
    """\
    id: https://example.org/nmdc-join-target
    name: nmdc_join_target
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      DenormalizedBiosample:
        attributes:
          biosample_id:
            identifier: true
          biosample_name: {}
          depth_value: {}
          depth_unit: {}
          env_broad_scale_term_id: {}
          ecosystem: {}
          ecosystem_category: {}
          study_id: {}
          study_name: {}
          pi_name: {}
          pi_email: {}
          study_ecosystem: {}
          funding_sources: {}
    """
)
88+
89+
90+
# ---------------------------------------------------------------------------
91+
# Helpers
92+
# ---------------------------------------------------------------------------
93+
94+
def _make_transformer(source_sv, target_sv, spec_yaml):
    """Return an ObjectTransformer configured for one test scenario.

    Args:
        source_sv: Schema view assigned to ``source_schemaview``.
        target_sv: Schema view assigned to ``target_schemaview``.
        spec_yaml: YAML text of the transformation spec; it is parsed with
            ``yaml.safe_load`` and handed to
            ``create_transformer_specification``.

    Returns:
        The transformer, created with ``unrestricted_eval=False``.
    """
    transformer = ObjectTransformer(unrestricted_eval=False)
    transformer.source_schemaview = source_sv
    transformer.target_schemaview = target_sv

    # Parse the YAML text into plain Python data before building the spec.
    parsed_spec = yaml.safe_load(spec_yaml)
    transformer.create_transformer_specification(parsed_spec)
    return transformer
100+
101+
102+
# ---------------------------------------------------------------------------
103+
# Tests
104+
# ---------------------------------------------------------------------------
105+
106+
107+
def test_biosample_study_join(tmp_path):
    """Biosample rows pick up Study fields through a join on associated_studies.

    Three biosamples are written to ``Biosample.tsv``. Two reference study ids
    that exist in ``Study.tsv``; the third references an id that is absent, and
    its study-derived slots are expected to come back as ``None``.
    """
    biosample_tsv = (
        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
        "nmdc:bsm-12-abc\tHopland soil\tnmdc:sty-11-001\t0.75\tm\t"
        "ENVO:01000174\tEnvironmental\tTerrestrial\n"
        "nmdc:bsm-12-def\tStream sediment\tnmdc:sty-11-002\t1.5\tm\t"
        "ENVO:00002030\tEnvironmental\tAquatic\n"
        "nmdc:bsm-12-ghi\tOrphan sample\tnmdc:sty-11-999\t0.1\tm\t"
        "ENVO:01000174\tEnvironmental\tTerrestrial\n"
    )
    study_tsv = (
        "id\tname\tpi_name\tpi_email\tecosystem\t"
        "ecosystem_category\tfunding_sources\n"
        "nmdc:sty-11-001\tHopland Metagenome\tElaine Faustman\t"
        "faustman@example.org\tEnvironmental\tTerrestrial\tDOE BER\n"
        "nmdc:sty-11-002\tStream Ecology\tJane Doe\t"
        "jdoe@example.org\tEnvironmental\tAquatic\tNSF\n"
    )
    (tmp_path / "Biosample.tsv").write_text(biosample_tsv)
    (tmp_path / "Study.tsv").write_text(study_tsv)

    # Biosample-native slots are copied with populated_from; Study slots are
    # reached through the joined table via "{Study.<slot>}" expressions.
    mapping_yaml = textwrap.dedent("""\
        class_derivations:
          DenormalizedBiosample:
            populated_from: Biosample
            joins:
              Study:
                source_key: associated_studies
                lookup_key: id
            slot_derivations:
              biosample_id:
                populated_from: id
              biosample_name:
                populated_from: name
              depth_value:
                populated_from: depth_value
              depth_unit:
                populated_from: depth_unit
              env_broad_scale_term_id:
                populated_from: env_broad_scale_term_id
              ecosystem:
                populated_from: ecosystem
              ecosystem_category:
                populated_from: ecosystem_category
              study_id:
                expr: "{Study.id}"
              study_name:
                expr: "{Study.name}"
              pi_name:
                expr: "{Study.pi_name}"
              pi_email:
                expr: "{Study.pi_email}"
              study_ecosystem:
                expr: "{Study.ecosystem}"
              funding_sources:
                expr: "{Study.funding_sources}"
    """)

    transformer = _make_transformer(
        SchemaView(SOURCE_SCHEMA_YAML),
        SchemaView(TARGET_SCHEMA_YAML),
        mapping_yaml,
    )
    data_loader = DataLoader(tmp_path)

    rows = list(transform_spec(transformer, data_loader))
    assert len(rows) == 3
    hopland, stream, orphan = rows

    # First biosample: joined to study nmdc:sty-11-001.
    assert hopland["biosample_id"] == "nmdc:bsm-12-abc"
    assert hopland["biosample_name"] == "Hopland soil"
    assert str(hopland["depth_value"]) == "0.75"
    assert hopland["depth_unit"] == "m"
    assert hopland["study_id"] == "nmdc:sty-11-001"
    assert hopland["study_name"] == "Hopland Metagenome"
    assert hopland["pi_name"] == "Elaine Faustman"
    assert hopland["pi_email"] == "faustman@example.org"
    assert hopland["funding_sources"] == "DOE BER"

    # Second biosample: joined to study nmdc:sty-11-002.
    assert stream["biosample_id"] == "nmdc:bsm-12-def"
    assert stream["study_id"] == "nmdc:sty-11-002"
    assert stream["study_name"] == "Stream Ecology"
    assert stream["pi_name"] == "Jane Doe"

    # Third biosample: nmdc:sty-11-999 is not in Study.tsv, so the
    # study-derived slots are expected to be null while native slots survive.
    assert orphan["biosample_id"] == "nmdc:bsm-12-ghi"
    assert orphan["biosample_name"] == "Orphan sample"
    assert orphan.get("study_id") is None
    assert orphan.get("study_name") is None
    assert orphan.get("pi_name") is None
199+
200+
201+
def test_biosample_study_join_preserves_biosample_fields(tmp_path):
    """Biosample-native slots come through unchanged when a join is configured.

    One biosample and its matching study are written. Every slot copied from
    the biosample is checked, plus two of the slots pulled in from the study.
    """
    biosample_tsv = (
        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
        "nmdc:bsm-12-xyz\tTest sample\tnmdc:sty-11-001\t2.0\tft\t"
        "ENVO:00002030\tHost-associated\tHuman\n"
    )
    study_tsv = (
        "id\tname\tpi_name\tpi_email\tecosystem\t"
        "ecosystem_category\tfunding_sources\n"
        "nmdc:sty-11-001\tTest Study\tTest PI\t"
        "test@example.org\tEnvironmental\tTerrestrial\tDOE\n"
    )
    (tmp_path / "Biosample.tsv").write_text(biosample_tsv)
    (tmp_path / "Study.tsv").write_text(study_tsv)

    # Biosample and Study both define ``ecosystem``; the target keeps them
    # apart as ``ecosystem`` (from Biosample) and ``study_ecosystem`` (from Study).
    mapping_yaml = textwrap.dedent("""\
        class_derivations:
          DenormalizedBiosample:
            populated_from: Biosample
            joins:
              Study:
                source_key: associated_studies
                lookup_key: id
            slot_derivations:
              biosample_id:
                populated_from: id
              biosample_name:
                populated_from: name
              depth_value:
                populated_from: depth_value
              depth_unit:
                populated_from: depth_unit
              env_broad_scale_term_id:
                populated_from: env_broad_scale_term_id
              ecosystem:
                populated_from: ecosystem
              ecosystem_category:
                populated_from: ecosystem_category
              study_id:
                expr: "{Study.id}"
              study_name:
                expr: "{Study.name}"
              pi_name:
                expr: "{Study.pi_name}"
              pi_email:
                expr: "{Study.pi_email}"
              study_ecosystem:
                expr: "{Study.ecosystem}"
              funding_sources:
                expr: "{Study.funding_sources}"
    """)

    transformer = _make_transformer(
        SchemaView(SOURCE_SCHEMA_YAML),
        SchemaView(TARGET_SCHEMA_YAML),
        mapping_yaml,
    )
    data_loader = DataLoader(tmp_path)

    rows = list(transform_spec(transformer, data_loader))
    assert len(rows) == 1
    (row,) = rows

    # Slots copied straight from the biosample record.
    assert row["biosample_id"] == "nmdc:bsm-12-xyz"
    assert row["biosample_name"] == "Test sample"
    assert str(row["depth_value"]) == "2.0"
    assert row["depth_unit"] == "ft"
    assert row["env_broad_scale_term_id"] == "ENVO:00002030"
    assert row["ecosystem"] == "Host-associated"
    assert row["ecosystem_category"] == "Human"

    # Slots supplied by the joined study record.
    assert row["study_id"] == "nmdc:sty-11-001"
    assert row["pi_name"] == "Test PI"

0 commit comments

Comments
 (0)