From 84125081dc5785d3358770768f5bdac277e80d8d Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Thu, 14 May 2026 14:03:34 -0700 Subject: [PATCH 1/2] fix(annotation): handle all VRS allele state types in deserialization allele_from_mapped_variant_dictionary_result unconditionally constructed a LiteralSequenceExpression from the stored state dict, causing a Pydantic ValidationError (500) for score sets containing reference-identical variants whose state is a ReferenceLengthExpression. - Dispatch on state type to construct RLE, LengthExpression, or LSE - Raise ValueError with an actionable message for unknown state types, replacing the cryptic multi-field Pydantic error as the failure mode - Add test constants for RLE and LengthExpression allele dicts - Parametrize state-type and CisPhasedBlock member tests to cover all three state types and enforce coverage of future additions --- src/mavedb/lib/annotation/util.py | 21 ++++++++++- tests/helpers/constants.py | 39 ++++++++++++++++++++ tests/lib/annotation/test_util.py | 59 ++++++++++++++++++++++++++++--- 3 files changed, 113 insertions(+), 6 deletions(-) diff --git a/src/mavedb/lib/annotation/util.py b/src/mavedb/lib/annotation/util.py index 760fedb3..d49d0ed2 100644 --- a/src/mavedb/lib/annotation/util.py +++ b/src/mavedb/lib/annotation/util.py @@ -7,8 +7,10 @@ Allele, CisPhasedBlock, Expression, + LengthExpression, LiteralSequenceExpression, MolecularVariation, + ReferenceLengthExpression, SequenceLocation, SequenceReference, ) @@ -81,9 +83,26 @@ def allele_from_mapped_variant_dictionary_result(allelic_mapping_results: dict) except KeyError: variation = allelic_mapping_results + state_dict = variation["state"] + state: ReferenceLengthExpression | LengthExpression | LiteralSequenceExpression + if state_dict.get("type") == "ReferenceLengthExpression": + state = ReferenceLengthExpression(**state_dict) + elif state_dict.get("type") == "LengthExpression": + state = LengthExpression(**state_dict) + elif state_dict.get("type") == "LiteralSequenceExpression": + state = LiteralSequenceExpression(**state_dict) + else: + raise ValueError( + f"Unsupported VRS Allele state type {state_dict.get('type')!r}. " + "Update allele_from_mapped_variant_dictionary_result to handle this type." + ) + + # Explicit field extraction for alleles is intentional: stored dicts may contain extra fields (e.g. "type" on + # Extension, "label" on SequenceReference) that the strict VRS Pydantic models forbid, so using model_validate() + # directly is not possible against stored mapping results. return Allele( id=variation.get("id"), - state=LiteralSequenceExpression(**variation["state"]), + state=state, digest=variation.get("digest"), location=SequenceLocation( start=variation.get("location", {}).get("start"), diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 96d73b4f..bf78d38f 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -126,6 +126,45 @@ "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE, TEST_VALID_POST_MAPPED_VRS_ALLELE], } +TEST_VALID_POST_MAPPED_VRS_ALLELE_RLE = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "state": {"type": "ReferenceLengthExpression", "length": 5, "repeatSubunitLength": 5}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 6, + "type": "SequenceLocation", + "start": 5, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "label": TEST_REFSEQ_IDENTIFIER, + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, + "expressions": [{"value": f"{TEST_REFSEQ_IDENTIFIER}:p.=", "syntax": "hgvs.p"}], +} + +TEST_VALID_POST_MAPPED_VRS_ALLELE_LENGTH_EXPRESSION = { + "id": TEST_GA4GH_IDENTIFIER, + "type": "Allele", + "state": {"type": "LengthExpression", "length": 5}, + "digest": TEST_GA4GH_DIGEST, + "location": { + "id": TEST_SEQUENCE_LOCATION_ACCESSION, + "end": 6, + "type": "SequenceLocation", + "start": 5, + "digest": TEST_GA4GH_DIGEST, + "sequenceReference": { + "type": "SequenceReference", + "label": TEST_REFSEQ_IDENTIFIER, + "refgetAccession": TEST_REFGET_ACCESSION, + }, + }, +} + TEST_PUBMED_PUBLICATION = { "identifier": TEST_PUBMED_IDENTIFIER, "db_name": "PubMed", diff --git a/tests/lib/annotation/test_util.py b/tests/lib/annotation/test_util.py index 7e5d6359..fcafd8aa 100644 --- a/tests/lib/annotation/test_util.py +++ b/tests/lib/annotation/test_util.py @@ -30,7 +30,12 @@ variation_from_mapped_variant, vrs_object_from_mapped_variant, ) -from tests.helpers.constants import TEST_SEQUENCE_LOCATION_ACCESSION, TEST_VALID_POST_MAPPED_VRS_ALLELE +from tests.helpers.constants import ( + TEST_SEQUENCE_LOCATION_ACCESSION, + TEST_VALID_POST_MAPPED_VRS_ALLELE, + TEST_VALID_POST_MAPPED_VRS_ALLELE_LENGTH_EXPRESSION, + TEST_VALID_POST_MAPPED_VRS_ALLELE_RLE, +) @pytest.mark.unit @@ -55,16 +60,60 @@ def test_variation_from_mapped_variant_no_post_mapped(self, mock_mapped_variant) with pytest.raises(MappingDataDoesntExistException): variation_from_mapped_variant(mock_mapped_variant) - def test_vrs_object_from_mapped_variant_handles_haplotype_member_list(self): - mapping_results = { - "type": "Haplotype", - "members": [TEST_VALID_POST_MAPPED_VRS_ALLELE, TEST_VALID_POST_MAPPED_VRS_ALLELE], + @pytest.mark.parametrize( + "allele_dict, expected_state_type, expected_state_fields", + [ + ( + TEST_VALID_POST_MAPPED_VRS_ALLELE, + "LiteralSequenceExpression", + {"sequence": "F"}, + ), + ( + TEST_VALID_POST_MAPPED_VRS_ALLELE_RLE, + "ReferenceLengthExpression", + {"length": 5, "repeatSubunitLength": 5}, + ), + ( + TEST_VALID_POST_MAPPED_VRS_ALLELE_LENGTH_EXPRESSION, + "LengthExpression", + {"length": 5}, + ), + ], + ids=["lse_state", "rle_state", "length_expression_state"], + ) + def test_allele_state_type_is_deserialized_correctly(self, allele_dict, expected_state_type, expected_state_fields): + result = vrs_object_from_mapped_variant(allele_dict).model_dump() + + assert result["state"]["type"] == expected_state_type + for k, v in expected_state_fields.items(): + assert result["state"][k] == v + + def test_unknown_allele_state_type_raises_value_error(self): + allele_dict = { + **TEST_VALID_POST_MAPPED_VRS_ALLELE, + "state": {"type": "UnknownFutureExpression", "someField": "someValue"}, } + with pytest.raises(ValueError, match="Unsupported VRS Allele state type"): + vrs_object_from_mapped_variant(allele_dict) + + @pytest.mark.parametrize( + "member_dict, expected_state_type", + [ + (TEST_VALID_POST_MAPPED_VRS_ALLELE, "LiteralSequenceExpression"), + (TEST_VALID_POST_MAPPED_VRS_ALLELE_RLE, "ReferenceLengthExpression"), + (TEST_VALID_POST_MAPPED_VRS_ALLELE_LENGTH_EXPRESSION, "LengthExpression"), + ], + ids=["lse_members", "rle_members", "length_expression_members"], + ) + def test_cis_phased_block_member_state_types_are_deserialized_correctly(self, member_dict, expected_state_type): + mapping_results = {"type": "CisPhasedBlock", "members": [member_dict, member_dict]} + result = vrs_object_from_mapped_variant(mapping_results).model_dump() assert result["type"] == "CisPhasedBlock" assert len(result["members"]) == 2 + assert result["members"][0]["state"]["type"] == expected_state_type @pytest.mark.unit From cc5e19c13f5612c2595439183c3788c250b82316 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Fri, 15 May 2026 11:15:19 -0700 Subject: [PATCH 2/2] fix(annotation): pass evidence model instances directly to va-spec fields Serializing evidence to dicts via `serialize_evidence_items` before assigning to `hasEvidenceItems`/`hasEvidenceLines` caused `VariantPathogenicityEvidenceLine` validation to fail for Statement objects containing nested VRS Alleles with production genomic data (regex constraints on `digest`, `refgetAccession`, etc. failed during dict reconstruction). - Remove `serialize_evidence_items` from `util.py` entirely - `acmg_evidence_line`: pass `list(evidence)` directly; Statement is in `has_evidence_items_models` and passes the isinstance check - `functional_evidence_line`: wrap items as `StudyResult(root=item)`; `ExperimentalVariantFunctionalImpactStudyResult` does not inherit `StudyResult` so direct instances fail the isinstance check - `mapped_variant_to_pathogenicity_statement`: pass `list(evidence)` - Add regression tests in `test_evidence_line.py` and `test_annotate.py` that assert evidence items are model instances, not raw dicts - Remove `TestSerializeEvidenceItems` from `test_util.py` --- src/mavedb/lib/annotation/evidence_line.py | 5 +-- src/mavedb/lib/annotation/statement.py | 3 +- src/mavedb/lib/annotation/util.py | 30 +------------- tests/lib/annotation/test_annotate.py | 21 ++++++++++ tests/lib/annotation/test_evidence_line.py | 46 ++++++++++++++++++++-- tests/lib/annotation/test_util.py | 19 --------- 6 files changed, 68 insertions(+), 56 deletions(-) diff --git a/src/mavedb/lib/annotation/evidence_line.py b/src/mavedb/lib/annotation/evidence_line.py index 2ae7d8fa..8ebf7f16 100644 --- a/src/mavedb/lib/annotation/evidence_line.py +++ b/src/mavedb/lib/annotation/evidence_line.py @@ -30,7 +30,6 @@ functional_score_calibration_as_method, pathogenicity_score_calibration_as_method, ) -from mavedb.lib.annotation.util import serialize_evidence_items from mavedb.models.mapped_variant import MappedVariant from mavedb.models.score_calibration import ScoreCalibration @@ -65,7 +64,7 @@ def acmg_evidence_line( return VariantPathogenicityEvidenceLine( description=f"Pathogenicity evidence line for {mapped_variant.variant.urn}.", - hasEvidenceItems=serialize_evidence_items(evidence), + hasEvidenceItems=list(evidence), specifiedBy=pathogenicity_score_calibration_as_method(score_calibration, evidence_outcome), evidenceOutcome={ "primaryCoding": Coding( @@ -102,7 +101,7 @@ def functional_evidence_line( return EvidenceLine( description=f"Functional evidence line for {mapped_variant.variant.urn}", - hasEvidenceItems=serialize_evidence_items(evidence), + hasEvidenceItems=[StudyResult(root=item) for item in evidence], directionOfEvidenceProvided=direction_of_support_for_functional_classification(classification), evidenceOutcome=MappableConcept( primaryCoding=Coding( diff --git a/src/mavedb/lib/annotation/statement.py b/src/mavedb/lib/annotation/statement.py index edd99f0e..c77d30f1 100644 --- a/src/mavedb/lib/annotation/statement.py +++ b/src/mavedb/lib/annotation/statement.py @@ -20,7 +20,6 @@ variant_interpretation_functional_guideline_method, variant_interpretation_pathogenicity_guideline_method, ) -from mavedb.lib.annotation.util import serialize_evidence_items from mavedb.models.mapped_variant import MappedVariant from mavedb.models.score_calibration import ScoreCalibration from mavedb.models.score_calibration_functional_classification import ScoreCalibrationFunctionalClassification @@ -121,5 +120,5 @@ def mapped_variant_to_pathogenicity_statement( system="ACMG Guidelines, 2015", ), ), - hasEvidenceLines=serialize_evidence_items(evidence), + hasEvidenceLines=list(evidence), ) diff --git a/src/mavedb/lib/annotation/util.py b/src/mavedb/lib/annotation/util.py index d49d0ed2..b8c21515 100644 --- a/src/mavedb/lib/annotation/util.py +++ b/src/mavedb/lib/annotation/util.py @@ -1,5 +1,4 @@ -from collections.abc import Sequence -from typing import Any, Literal, Optional +from typing import Literal, Optional from ga4gh.core.models import Extension from ga4gh.va_spec.base.enums import StrengthOfEvidenceProvided as VaSpecStrengthOfEvidenceProvided @@ -29,33 +28,6 @@ from mavedb.models.score_calibration_functional_classification import ScoreCalibrationFunctionalClassification -def serialize_evidence_items(evidence: Sequence[Any]) -> list[dict[str, Any]]: - """Serialize evidence objects to dictionaries using `model_dump(exclude_none=True)`. - - Args: - evidence (Sequence[Any]): Evidence objects expected to provide a - `model_dump` method. - - Returns: - list[dict[str, Any]]: Evidence payloads suitable for assignment to - GA4GH VA model fields such as `hasEvidenceItems`. - - Raises: - TypeError: If any item does not expose a callable `model_dump` method. - """ - - serialized_evidence: list[dict[str, Any]] = [] - - for evidence_item in evidence: - model_dump = getattr(evidence_item, "model_dump", None) - if not callable(model_dump): - raise TypeError("Evidence items must provide a callable model_dump method.") - - serialized_evidence.append(model_dump(exclude_none=True)) - - return serialized_evidence - - def allele_from_mapped_variant_dictionary_result(allelic_mapping_results: dict) -> Allele: """ Converts a dictionary containing allelic mapping results into an Allele object. diff --git a/tests/lib/annotation/test_annotate.py b/tests/lib/annotation/test_annotate.py index b5c73683..f9118fc2 100644 --- a/tests/lib/annotation/test_annotate.py +++ b/tests/lib/annotation/test_annotate.py @@ -273,3 +273,24 @@ def test_variant_not_in_any_range_returns_uncertain_significance( assert result.type == "Statement" # Classification should be UNCERTAIN_SIGNIFICANCE assert result.classification.primaryCoding.code.root == "uncertain significance" + + def test_pathogenicity_evidence_line_has_evidence_items_are_statement_instances( + self, mock_mapped_variant_with_pathogenicity_calibration_score_set + ): + """Regression test: hasEvidenceItems on VariantPathogenicityEvidenceLine must be model instances. + + Passing serialized dict representations of Statement objects to hasEvidenceItems caused + VariantPathogenicityEvidenceLine validation to fail when reconstructing nested VRS objects + (e.g. Allele with production genomic coordinates). Model instances must be stored directly. + """ + result = variant_pathogenicity_statement(mock_mapped_variant_with_pathogenicity_calibration_score_set) + + assert result is not None + for evidence_line in result.hasEvidenceLines: + assert evidence_line.hasEvidenceItems is not None + for evidence_item in evidence_line.hasEvidenceItems: + # Must be a model instance, not a raw dict + assert not isinstance( + evidence_item, dict + ), "hasEvidenceItems contained a raw dict instead of a model instance" + assert evidence_item.type == "Statement" diff --git a/tests/lib/annotation/test_evidence_line.py b/tests/lib/annotation/test_evidence_line.py index b2c2dcd6..fb6f33b7 100644 --- a/tests/lib/annotation/test_evidence_line.py +++ b/tests/lib/annotation/test_evidence_line.py @@ -18,8 +18,13 @@ from ga4gh.va_spec.base.enums import StrengthOfEvidenceProvided from mavedb.lib.annotation.annotate import variant_study_result +from mavedb.lib.annotation.classification import ExperimentalVariantFunctionalImpactClassification from mavedb.lib.annotation.evidence_line import acmg_evidence_line, functional_evidence_line -from mavedb.lib.annotation.proposition import mapped_variant_to_experimental_variant_clinical_impact_proposition +from mavedb.lib.annotation.proposition import ( + mapped_variant_to_experimental_variant_clinical_impact_proposition, + mapped_variant_to_experimental_variant_functional_impact_proposition, +) +from mavedb.lib.annotation.statement import mapped_variant_to_functional_statement @pytest.mark.unit @@ -59,7 +64,8 @@ def test_acmg_evidence_line_with_met_valid_clinical_classification( ): proposition = mapped_variant_to_experimental_variant_clinical_impact_proposition(mapped_variant) study_result = variant_study_result(mapped_variant) - result = acmg_evidence_line(mapped_variant, score_calibration, proposition, [study_result]) + evidence = functional_evidence_line(mapped_variant, score_calibration, [study_result]) + result = acmg_evidence_line(mapped_variant, score_calibration, proposition, [evidence]) if expected_strength == StrengthOfEvidenceProvided.STRONG: expected_evidence_outcome = expected_outcome.value @@ -96,7 +102,8 @@ def test_acmg_evidence_line_with_not_met_clinical_classification( ): proposition = mapped_variant_to_experimental_variant_clinical_impact_proposition(mapped_variant) study_result = variant_study_result(mapped_variant) - result = acmg_evidence_line(mapped_variant, score_calibration, proposition, [study_result]) + evidence = functional_evidence_line(mapped_variant, score_calibration, [study_result]) + result = acmg_evidence_line(mapped_variant, score_calibration, proposition, [evidence]) assert isinstance(result, VariantPathogenicityEvidenceLine) assert result.description == f"Pathogenicity evidence line for {mapped_variant.variant.urn}." @@ -120,6 +127,39 @@ def test_acmg_evidence_line_with_no_calibrations_raises_error(self, mock_mapped_ study_result = variant_study_result(mock_mapped_variant) acmg_evidence_line(mock_mapped_variant, score_calibration, proposition, [study_result]) + def test_acmg_evidence_line_accepts_statement_evidence_without_serialization_error( + self, + mock_mapped_variant_with_pathogenicity_calibration_score_set, + ): + """Regression test: VariantPathogenicityEvidenceLine must accept Statement instances directly. + + Previously, acmg_evidence_line serialized Statement evidence to dicts via model_dump, + which caused VariantPathogenicityEvidenceLine validation to fail when reconstructing + nested VRS objects from those dicts (e.g. Allele with real genomic coordinates). + Evidence model instances must be passed directly, not serialized. + """ + mapped_variant = mock_mapped_variant_with_pathogenicity_calibration_score_set + score_calibration = mapped_variant.variant.score_set.score_calibrations[0] + + study_result = variant_study_result(mapped_variant) + functional_proposition = mapped_variant_to_experimental_variant_functional_impact_proposition(mapped_variant) + functional_evidence = functional_evidence_line(mapped_variant, score_calibration, [study_result]) + functional_statement = mapped_variant_to_functional_statement( + mapped_variant, + functional_proposition, + [functional_evidence], + score_calibration, + ExperimentalVariantFunctionalImpactClassification.NORMAL, + ) + clinical_proposition = mapped_variant_to_experimental_variant_clinical_impact_proposition(mapped_variant) + + result = acmg_evidence_line(mapped_variant, score_calibration, clinical_proposition, [functional_statement]) + + assert isinstance(result, VariantPathogenicityEvidenceLine) + assert len(result.hasEvidenceItems) == 1 + # Evidence items must be Statement model instances, not raw dicts + assert result.hasEvidenceItems[0].type == "Statement" + @pytest.mark.unit class TestFunctionalEvidenceLine: diff --git a/tests/lib/annotation/test_util.py b/tests/lib/annotation/test_util.py index fcafd8aa..515ae628 100644 --- a/tests/lib/annotation/test_util.py +++ b/tests/lib/annotation/test_util.py @@ -26,7 +26,6 @@ select_strongest_functional_calibration, select_strongest_pathogenicity_calibration, sequence_feature_for_mapped_variant, - serialize_evidence_items, variation_from_mapped_variant, vrs_object_from_mapped_variant, ) @@ -572,24 +571,6 @@ def test_sequence_feature_raises_when_target_has_no_name_or_ids(self, mock_mappe sequence_feature_for_mapped_variant(mock_mapped_variant) -@pytest.mark.unit -class TestSerializeEvidenceItems: - def test_serialize_evidence_items_serializes_all_items_in_order(self): - first_item = SimpleNamespace(model_dump=lambda *, exclude_none: {"id": "first", "exclude_none": exclude_none}) - second_item = SimpleNamespace(model_dump=lambda *, exclude_none: {"id": "second", "exclude_none": exclude_none}) - - result = serialize_evidence_items([first_item, second_item]) - - assert result == [ - {"id": "first", "exclude_none": True}, - {"id": "second", "exclude_none": True}, - ] - - def test_serialize_evidence_items_raises_for_non_dumpable_item(self): - with pytest.raises(TypeError, match="model_dump"): - serialize_evidence_items([SimpleNamespace(not_model_dump=True)]) - - @pytest.mark.integration class TestAnnotationUtilIntegration: """Integration tests for utility helpers using persisted DB-backed variants."""