|
| 1 | +"""Tests for cross-table Biosample-Study denormalization using joins. |
| 2 | +
|
| 3 | +Demonstrates the ``joins:`` feature from PR #136: Biosample rows are |
| 4 | +enriched with Study metadata (PI name, ecosystem, study name) by joining |
| 5 | +on ``associated_studies``. This is the pattern needed for NMDC lakehouse |
| 6 | +tables where biosample rows carry denormalized study context. |
| 7 | +
|
| 8 | +This test requires the cross-table join feature introduced in PR #136. |
| 9 | +
|
| 10 | +See also: |
| 11 | +- PR #136 (cross-table lookup support) |
| 12 | +- Issue #134 (cross-table slot lookup design) |
| 13 | +""" |
| 14 | + |
| 15 | +# ruff: noqa: PLR2004 |
| 16 | + |
| 17 | +import textwrap |
| 18 | + |
| 19 | +import yaml |
| 20 | +from linkml_runtime import SchemaView |
| 21 | + |
| 22 | +from linkml_map.loaders.data_loaders import DataLoader |
| 23 | +from linkml_map.transformer.engine import transform_spec |
| 24 | +from linkml_map.transformer.object_transformer import ObjectTransformer |
| 25 | + |
| 26 | +# --------------------------------------------------------------------------- |
| 27 | +# Schemas |
| 28 | +# --------------------------------------------------------------------------- |
| 29 | + |
# Source schema: two classes, ``Biosample`` and ``Study``, each identified by
# ``id``. Attributes declared as ``{}`` carry no explicit range and therefore
# fall back to ``default_range: string``. The tests in this module join
# ``Biosample.associated_studies`` against ``Study.id``.
SOURCE_SCHEMA_YAML = textwrap.dedent("""\
    id: https://example.org/nmdc-join-source
    name: nmdc_join_source
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      Biosample:
        attributes:
          id:
            identifier: true
          name: {}
          associated_studies: {}
          depth_value: {}
          depth_unit: {}
          env_broad_scale_term_id: {}
          ecosystem: {}
          ecosystem_category: {}
      Study:
        attributes:
          id:
            identifier: true
          name: {}
          pi_name: {}
          pi_email: {}
          ecosystem: {}
          ecosystem_category: {}
          funding_sources: {}
""")
| 61 | + |
# Target schema: a single flat class, ``DenormalizedBiosample``, identified by
# ``biosample_id``. It carries the biosample's own slots alongside slots that
# the test specs fill from the joined ``Study`` row (``study_id``,
# ``study_name``, ``pi_name``, ``pi_email``, ``study_ecosystem``,
# ``funding_sources``). All slots use ``default_range: string``.
TARGET_SCHEMA_YAML = textwrap.dedent("""\
    id: https://example.org/nmdc-join-target
    name: nmdc_join_target
    prefixes:
      linkml: https://w3id.org/linkml/
    imports:
      - linkml:types
    default_range: string
    classes:
      DenormalizedBiosample:
        attributes:
          biosample_id:
            identifier: true
          biosample_name: {}
          depth_value: {}
          depth_unit: {}
          env_broad_scale_term_id: {}
          ecosystem: {}
          ecosystem_category: {}
          study_id: {}
          study_name: {}
          pi_name: {}
          pi_email: {}
          study_ecosystem: {}
          funding_sources: {}
""")
| 88 | + |
| 89 | + |
| 90 | +# --------------------------------------------------------------------------- |
| 91 | +# Helpers |
| 92 | +# --------------------------------------------------------------------------- |
| 93 | + |
def _make_transformer(source_sv, target_sv, spec_yaml):
    """Build an ``ObjectTransformer`` wired to the given schemas and spec.

    :param source_sv: schema view assigned as the transformer's source schema.
    :param target_sv: schema view assigned as the transformer's target schema.
    :param spec_yaml: YAML text of the transformer specification; it is parsed
        with ``yaml.safe_load`` before being handed to the transformer.
    :return: the configured transformer, created with ``unrestricted_eval=False``.
    """
    transformer = ObjectTransformer(unrestricted_eval=False)
    transformer.source_schemaview = source_sv
    transformer.target_schemaview = target_sv
    spec_dict = yaml.safe_load(spec_yaml)
    transformer.create_transformer_specification(spec_dict)
    return transformer
| 100 | + |
| 101 | + |
| 102 | +# --------------------------------------------------------------------------- |
| 103 | +# Tests |
| 104 | +# --------------------------------------------------------------------------- |
| 105 | + |
| 106 | + |
def test_biosample_study_join(tmp_path):
    """Biosample rows are enriched with Study metadata via join on associated_studies.

    Three biosamples are transformed: two reference studies present in
    ``Study.tsv`` and one references a study id that is absent. All three rows
    must be emitted, and every Study-derived slot of the orphan row must be null.
    """
    # Explicit encoding keeps the fixture bytes independent of the platform locale.
    (tmp_path / "Biosample.tsv").write_text(
        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
        "nmdc:bsm-12-abc\tHopland soil\tnmdc:sty-11-001\t0.75\tm\t"
        "ENVO:01000174\tEnvironmental\tTerrestrial\n"
        "nmdc:bsm-12-def\tStream sediment\tnmdc:sty-11-002\t1.5\tm\t"
        "ENVO:00002030\tEnvironmental\tAquatic\n"
        "nmdc:bsm-12-ghi\tOrphan sample\tnmdc:sty-11-999\t0.1\tm\t"
        "ENVO:01000174\tEnvironmental\tTerrestrial\n",
        encoding="utf-8",
    )
    (tmp_path / "Study.tsv").write_text(
        "id\tname\tpi_name\tpi_email\tecosystem\t"
        "ecosystem_category\tfunding_sources\n"
        "nmdc:sty-11-001\tHopland Metagenome\tElaine Faustman\t"
        "faustman@example.org\tEnvironmental\tTerrestrial\tDOE BER\n"
        "nmdc:sty-11-002\tStream Ecology\tJane Doe\t"
        "jdoe@example.org\tEnvironmental\tAquatic\tNSF\n",
        encoding="utf-8",
    )

    # Biosample-native slots use ``populated_from``; Study-derived slots use
    # ``expr`` with the join alias (``{Study.<slot>}``).
    spec = textwrap.dedent("""\
        class_derivations:
          DenormalizedBiosample:
            populated_from: Biosample
            joins:
              Study:
                source_key: associated_studies
                lookup_key: id
            slot_derivations:
              biosample_id:
                populated_from: id
              biosample_name:
                populated_from: name
              depth_value:
                populated_from: depth_value
              depth_unit:
                populated_from: depth_unit
              env_broad_scale_term_id:
                populated_from: env_broad_scale_term_id
              ecosystem:
                populated_from: ecosystem
              ecosystem_category:
                populated_from: ecosystem_category
              study_id:
                expr: "{Study.id}"
              study_name:
                expr: "{Study.name}"
              pi_name:
                expr: "{Study.pi_name}"
              pi_email:
                expr: "{Study.pi_email}"
              study_ecosystem:
                expr: "{Study.ecosystem}"
              funding_sources:
                expr: "{Study.funding_sources}"
    """)

    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(TARGET_SCHEMA_YAML)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)

    results = list(transform_spec(tr, loader))
    assert len(results) == 3

    # Biosample 1: joined to Study 001
    r0 = results[0]
    assert r0["biosample_id"] == "nmdc:bsm-12-abc"
    assert r0["biosample_name"] == "Hopland soil"
    assert str(r0["depth_value"]) == "0.75"
    assert r0["depth_unit"] == "m"
    assert r0["study_id"] == "nmdc:sty-11-001"
    assert r0["study_name"] == "Hopland Metagenome"
    assert r0["pi_name"] == "Elaine Faustman"
    assert r0["pi_email"] == "faustman@example.org"
    assert r0["study_ecosystem"] == "Environmental"
    assert r0["funding_sources"] == "DOE BER"

    # Biosample 2: joined to Study 002
    r1 = results[1]
    assert r1["biosample_id"] == "nmdc:bsm-12-def"
    assert r1["study_id"] == "nmdc:sty-11-002"
    assert r1["study_name"] == "Stream Ecology"
    assert r1["pi_name"] == "Jane Doe"

    # Biosample 3: orphan - Study 999 doesn't exist, so every Study-derived
    # slot (not just a sample of them) must come back null.
    r2 = results[2]
    assert r2["biosample_id"] == "nmdc:bsm-12-ghi"
    assert r2["biosample_name"] == "Orphan sample"
    study_slots = (
        "study_id",
        "study_name",
        "pi_name",
        "pi_email",
        "study_ecosystem",
        "funding_sources",
    )
    for slot in study_slots:
        assert r2.get(slot) is None, f"{slot} should be null for an unmatched join"
| 199 | + |
| 200 | + |
def test_biosample_study_join_preserves_biosample_fields(tmp_path):
    """All biosample-native fields pass through unchanged when joins are present.

    The fixture gives the Biosample and its Study different ``ecosystem`` and
    ``ecosystem_category`` values, so asserting both the biosample's
    ``ecosystem`` and the joined ``study_ecosystem`` shows that each slot is
    filled from its own table.
    """
    # Explicit encoding keeps the fixture bytes independent of the platform locale.
    (tmp_path / "Biosample.tsv").write_text(
        "id\tname\tassociated_studies\tdepth_value\tdepth_unit\t"
        "env_broad_scale_term_id\tecosystem\tecosystem_category\n"
        "nmdc:bsm-12-xyz\tTest sample\tnmdc:sty-11-001\t2.0\tft\t"
        "ENVO:00002030\tHost-associated\tHuman\n",
        encoding="utf-8",
    )
    (tmp_path / "Study.tsv").write_text(
        "id\tname\tpi_name\tpi_email\tecosystem\t"
        "ecosystem_category\tfunding_sources\n"
        "nmdc:sty-11-001\tTest Study\tTest PI\t"
        "test@example.org\tEnvironmental\tTerrestrial\tDOE\n",
        encoding="utf-8",
    )

    # Biosample-native slots use ``populated_from``; Study-derived slots use
    # ``expr`` with the join alias (``{Study.<slot>}``).
    spec = textwrap.dedent("""\
        class_derivations:
          DenormalizedBiosample:
            populated_from: Biosample
            joins:
              Study:
                source_key: associated_studies
                lookup_key: id
            slot_derivations:
              biosample_id:
                populated_from: id
              biosample_name:
                populated_from: name
              depth_value:
                populated_from: depth_value
              depth_unit:
                populated_from: depth_unit
              env_broad_scale_term_id:
                populated_from: env_broad_scale_term_id
              ecosystem:
                populated_from: ecosystem
              ecosystem_category:
                populated_from: ecosystem_category
              study_id:
                expr: "{Study.id}"
              study_name:
                expr: "{Study.name}"
              pi_name:
                expr: "{Study.pi_name}"
              pi_email:
                expr: "{Study.pi_email}"
              study_ecosystem:
                expr: "{Study.ecosystem}"
              funding_sources:
                expr: "{Study.funding_sources}"
    """)

    source_sv = SchemaView(SOURCE_SCHEMA_YAML)
    target_sv = SchemaView(TARGET_SCHEMA_YAML)
    tr = _make_transformer(source_sv, target_sv, spec)
    loader = DataLoader(tmp_path)

    results = list(transform_spec(tr, loader))
    assert len(results) == 1

    r = results[0]
    # All biosample fields preserved
    assert r["biosample_id"] == "nmdc:bsm-12-xyz"
    assert r["biosample_name"] == "Test sample"
    assert str(r["depth_value"]) == "2.0"
    assert r["depth_unit"] == "ft"
    assert r["env_broad_scale_term_id"] == "ENVO:00002030"
    assert r["ecosystem"] == "Host-associated"
    assert r["ecosystem_category"] == "Human"
    # Study fields joined
    assert r["study_id"] == "nmdc:sty-11-001"
    assert r["study_name"] == "Test Study"
    assert r["pi_name"] == "Test PI"
    assert r["pi_email"] == "test@example.org"
    # Differs from the biosample's own "Host-associated" value above, so this
    # fails if the joined slot were filled from the wrong table.
    assert r["study_ecosystem"] == "Environmental"
    assert r["funding_sources"] == "DOE"
0 commit comments