From 3a0eec3a502678a5aaed94ce00477dc47a6eb7ef Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Fri, 17 Apr 2026 15:29:41 -0700 Subject: [PATCH 1/8] add seqqc section and test --- pori_python/ipr/content.spec.json | 96 ++++++++ tests/test_ipr/test_upload.py | 394 +++++++++++++++++------------- 2 files changed, 319 insertions(+), 171 deletions(-) diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index 5a1793a2..711f9eb5 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -892,6 +892,102 @@ "example": "POG", "type": "string" }, + "seqQC": { + "type": "array", + "items": { + "type": "object", + "properties": { + "reads": { + "description": "Number of reads", + "example": "2534M", + "type": [ + "string", + "null" + ] + }, + "bioQC": { + "description": "Biological QC status", + "example": "passed", + "type": [ + "string", + "null" + ] + }, + "labQC": { + "description": "Lab QC status", + "example": "passed", + "type": [ + "string", + "null" + ] + }, + "sample": { + "description": "Sample identifier, e.g. Tumour DNA, Constitutional DNA", + "example": "Tumour DNA", + "type": [ + "string", + "null" + ] + }, + "library": { + "description": "Library identifier", + "example": "LIB0001", + "type": [ + "string", + "null" + ] + }, + "coverage": { + "description": "Sequencing coverage", + "example": "80x", + "type": [ + "string", + "null" + ] + }, + "inputNg": { + "description": "Input amount in nanograms", + "example": "500", + "type": [ + "string", + "null" + ] + }, + "inputUg": { + "description": "Input amount in micrograms", + "example": "0.5", + "type": [ + "string", + "null" + ] + }, + "protocol": { + "description": "Sequencing protocol", + "example": "WGS", + "type": [ + "string", + "null" + ] + }, + "sampleName": { + "description": "Full sample name", + "example": "SAMPLE1-FF-1", + "type": [ + "string", + "null" + ] + }, + "duplicateReadsPerc": { + "description": "Percentage of duplicate reads", + "example": "12.3", + "type": [ + "string", + "null" + ] + } + } + } + }, "smallMutations": { "default": [], "items": { diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py index 2c6fb73c..80bdd4b5 100644 --- a/tests/test_ipr/test_upload.py +++ b/tests/test_ipr/test_upload.py @@ -13,100 +13,132 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' -EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' -INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1' -DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1' +EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" +EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" +INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1" +DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1" def get_test_spec(): - ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} + ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec['components']['schemas']['genesCreate']['properties'][key] = '' + ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), 'test_data', name) + return os.path.join(os.path.dirname(__file__), "test_data", name) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def loaded_reports(tmp_path_factory) -> Generator: - json_file = tmp_path_factory.mktemp('inputs') / 'content.json' - async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json' - patient_id = f'TEST_{str(uuid.uuid4())}' - async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}' + json_file = tmp_path_factory.mktemp("inputs") / "content.json" + async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json" + patient_id = f"TEST_{str(uuid.uuid4())}" + async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}" json_contents = { - 'comparators': [ - {'analysisRole': 'expression (disease)', 'name': '1'}, - {'analysisRole': 'expression (primary site)', 'name': '2'}, - {'analysisRole': 'expression (biopsy site)', 'name': '3'}, + "comparators": [ + {"analysisRole": "expression (disease)", "name": "1"}, + {"analysisRole": "expression (primary site)", "name": "2"}, + {"analysisRole": "expression (biopsy site)", "name": "3"}, { - 'analysisRole': 'expression (internal pancancer cohort)', - 'name': '4', + "analysisRole": "expression (internal pancancer cohort)", + "name": "4", }, ], - 'patientId': patient_id, - 'project': 'TEST', - 'sampleInfo': [ + "patientId": patient_id, + "project": "TEST", + "sampleInfo": [ { - 'sample': 'Constitutional', - 'biopsySite': 'Normal tissue', - 'sampleName': 'SAMPLE1-PB', - 'primarySite': 'Blood-Peripheral', - 'collectionDate': '11-11-11', + "sample": "Constitutional", + "biopsySite": "Normal tissue", + "sampleName": "SAMPLE1-PB", + "primarySite": "Blood-Peripheral", + "collectionDate": "11-11-11", }, { - 'sample': 'Tumour', - 'pathoTc': '90%', - 'biopsySite': 'hepatic', - 'sampleName': 'SAMPLE2-FF-1', - 'primarySite': 'Vena Cava-Hepatic', - 'collectionDate': '12-12-12', + "sample": "Tumour", + "pathoTc": "90%", + "biopsySite": "hepatic", + "sampleName": "SAMPLE2-FF-1", + "primarySite": "Vena Cava-Hepatic", + "collectionDate": "12-12-12", }, ], - 'msi': [ + "msi": [ { - 'score': 1000.0, - 'kbCategory': 'microsatellite instability', + "score": 1000.0, + "kbCategory": "microsatellite instability", } ], - 'hrd': { - 'score': 9999.0, - 'cutoff': 5, + "hrd": { + "score": 9999.0, + "cutoff": 5, }, - 'expressionVariants': json.loads( - pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records') + "expressionVariants": json.loads( + pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json( + orient="records" + ) ), - 'smallMutations': json.loads( - pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json( - orient='records' + "smallMutations": json.loads( + pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json( + orient="records" ) ), - 'copyVariants': json.loads( - pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json( - orient='records' + "copyVariants": json.loads( + pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json( + orient="records" ) ), - 'structuralVariants': json.loads( - pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records') + "structuralVariants": json.loads( + pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json( + orient="records" + ) ), - 'kbDiseaseMatch': 'colorectal cancer', - 'cosmicSignatures': pd.read_csv( - get_test_file('cosmic_variants.tab'), sep='\t' + "kbDiseaseMatch": "colorectal cancer", + "cosmicSignatures": pd.read_csv( + get_test_file("cosmic_variants.tab"), sep="\t" ).signature.tolist(), - 'hlaTypes': json.loads( - pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records') + "hlaTypes": json.loads( + pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json( + orient="records" + ) ), - 'images': [ + "images": [ { - 'key': 'cnvLoh.circos', - 'path': 'test/testData/images/cnvLoh.png', - 'caption': 'Test adding a caption to an image', + "key": "cnvLoh.circos", + "path": "test/testData/images/cnvLoh.png", + "caption": "Test adding a caption to an image", } ], - 'config': 'test config', + "seqQC": [ + { + "sample": "Tumour DNA", + "reads": "2534M", + "library": "LIB0001", + "coverage": "80x", + "inputNg": "500", + "protocol": "WGS", + "sampleName": "SAMPLE2-FF-1", + "bioQC": "passed", + "labQC": "passed", + "duplicateReadsPerc": "12.3", + }, + { + "sample": "Constitutional DNA", + "reads": "1200M", + "library": "LIB0002", + "coverage": "40x", + "inputNg": "300", + "protocol": "WGS", + "sampleName": "SAMPLE1-PB", + "bioQC": "passed", + "labQC": "passed", + "duplicateReadsPerc": "8.1", + }, + ], + "config": "test config", } json_file.write_text( @@ -116,7 +148,7 @@ def loaded_reports(tmp_path_factory) -> Generator: ) ) - json_contents['patientId'] = async_patient_id + json_contents["patientId"] = async_patient_id async_json_file.write_text( json.dumps( json_contents, @@ -125,46 +157,46 @@ def loaded_reports(tmp_path_factory) -> Generator: ) argslist = [ - 'ipr', - '--username', - os.environ.get('IPR_USER', os.environ['USER']), - '--password', - os.environ['IPR_PASS'], - '--graphkb_username', - os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])), - '--graphkb_password', - os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), - '--ipr_url', - os.environ['IPR_TEST_URL'], - '--graphkb_url', - os.environ.get('GRAPHKB_URL', False), - '--therapeutics', - '--allow_partial_matches', + "ipr", + "--username", + os.environ.get("IPR_USER", os.environ["USER"]), + "--password", + os.environ["IPR_PASS"], + "--graphkb_username", + os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])), + "--graphkb_password", + os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), + "--ipr_url", + os.environ["IPR_TEST_URL"], + "--graphkb_url", + os.environ.get("GRAPHKB_URL", False), + "--therapeutics", + "--allow_partial_matches", ] sync_argslist = argslist.copy() - sync_argslist.extend(['--content', str(json_file)]) - with patch.object(sys, 'argv', sync_argslist): - with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): + sync_argslist.extend(["--content", str(json_file)]) + with patch.object(sys, "argv", sync_argslist): + with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): command_interface() async_argslist = argslist.copy() - async_argslist.extend(['--content', str(async_json_file), '--async_upload']) - with patch.object(sys, 'argv', async_argslist): - with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): + async_argslist.extend(["--content", str(async_json_file), "--async_upload"]) + with patch.object(sys, "argv", async_argslist): + with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): command_interface() ipr_conn = IprConnection( - username=os.environ.get('IPR_USER', os.environ['USER']), - password=os.environ['IPR_PASS'], - url=os.environ['IPR_TEST_URL'], + username=os.environ.get("IPR_USER", os.environ["USER"]), + password=os.environ["IPR_PASS"], + url=os.environ["IPR_TEST_URL"], ) - loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}') - async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}') + loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}") + async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}") loaded_reports_result = { - 'sync': (patient_id, loaded_report), - 'async': (async_patient_id, async_loaded_report), + "sync": (patient_id, loaded_report), + "async": (async_patient_id, async_loaded_report), } yield loaded_reports_result if DELETE_UPLOAD_TEST_REPORTS: @@ -173,13 +205,13 @@ def loaded_reports(tmp_path_factory) -> Generator: def get_section(loaded_report, section_name): - ident = loaded_report[1]['reports'][0]['ident'] + ident = loaded_report[1]["reports"][0]["ident"] ipr_conn = IprConnection( - username=os.environ.get('IPR_USER', os.environ['USER']), - password=os.environ['IPR_PASS'], - url=os.environ['IPR_TEST_URL'], + username=os.environ.get("IPR_USER", os.environ["USER"]), + password=os.environ["IPR_PASS"], + url=os.environ["IPR_TEST_URL"], ) - return ipr_conn.get(uri=f'reports/{ident}/{section_name}') + return ipr_conn.get(uri=f"reports/{ident}/{section_name}") def stringify_sorted(obj): @@ -192,7 +224,7 @@ def stringify_sorted(obj): obj.sort() return str(obj) elif isinstance(obj, dict): - for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'): + for key in ("ident", "updatedAt", "createdAt", "deletedAt"): obj.pop(key, None) keys = obj.keys() for key in keys: @@ -208,135 +240,145 @@ def stringify_sorted(obj): @pytest.mark.skipif( - not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance' + not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance" +) +@pytest.mark.skipif( + EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" ) -@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def test_patient_id_loaded_once(self, loaded_reports) -> None: - sync_patient_id = loaded_reports['sync'][0] - assert loaded_reports['sync'][1]['total'] == 1 - assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id - async_patient_id = loaded_reports['async'][0] - assert loaded_reports['async'][1]['total'] == 1 - assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id + sync_patient_id = loaded_reports["sync"][0] + assert loaded_reports["sync"][1]["total"] == 1 + assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id + async_patient_id = loaded_reports["async"][0] + assert loaded_reports["async"][1]["total"] == 1 + assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id def test_expression_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'expression-variants') - kbmatched = [item for item in section if item['kbMatches']] - assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched] - async_section = get_section(loaded_reports['async'], 'expression-variants') + section = get_section(loaded_reports["sync"], "expression-variants") + kbmatched = [item for item in section if item["kbMatches"]] + assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched] + async_section = get_section(loaded_reports["async"], "expression-variants") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_structural_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'structural-variants') - kbmatched = [item for item in section if item['kbMatches']] - assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched] - async_section = get_section(loaded_reports['async'], 'structural-variants') + section = get_section(loaded_reports["sync"], "structural-variants") + kbmatched = [item for item in section if item["kbMatches"]] + assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [ + item["displayName"] for item in kbmatched + ] + async_section = get_section(loaded_reports["async"], "structural-variants") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_small_mutations_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'small-mutations') - kbmatched = [item for item in section if item['kbMatches']] - assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched] - assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched] - async_section = get_section(loaded_reports['async'], 'small-mutations') + section = get_section(loaded_reports["sync"], "small-mutations") + kbmatched = [item for item in section if item["kbMatches"]] + assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched] + assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched] + async_section = get_section(loaded_reports["async"], "small-mutations") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_copy_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'copy-variants') - kbmatched = [item for item in section if item['kbMatches']] - assert ('ERBB2', 'amplification') in [ - (item['gene']['name'], item['displayName']) for item in kbmatched + section = get_section(loaded_reports["sync"], "copy-variants") + kbmatched = [item for item in section if item["kbMatches"]] + assert ("ERBB2", "amplification") in [ + (item["gene"]["name"], item["displayName"]) for item in kbmatched ] - async_section = get_section(loaded_reports['async'], 'copy-variants') + async_section = get_section(loaded_reports["async"], "copy-variants") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_signature_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'signature-variants') - kbmatched = [item for item in section if item['kbMatches']] + section = get_section(loaded_reports["sync"], "signature-variants") + kbmatched = [item for item in section if item["kbMatches"]] # Check for COSMIC signatures - assert ('SBS2', 'high signature') in [ - (item['signatureName'], item['variantTypeName']) for item in kbmatched + assert ("SBS2", "high signature") in [ + (item["signatureName"], item["variantTypeName"]) for item in kbmatched ] # Check for HRD signature (score 9999 > cutoff 5, so strong signature) - assert ('homologous recombination deficiency', 'strong signature') in [ - (item['signatureName'], item['variantTypeName']) for item in kbmatched + assert ("homologous recombination deficiency", "strong signature") in [ + (item["signatureName"], item["variantTypeName"]) for item in kbmatched ] # Check for MSI signature - assert ('microsatellite instability', 'high signature') in [ - (item['signatureName'], item['variantTypeName']) for item in kbmatched + assert ("microsatellite instability", "high signature") in [ + (item["signatureName"], item["variantTypeName"]) for item in kbmatched ] - async_section = get_section(loaded_reports['async'], 'signature-variants') + async_section = get_section(loaded_reports["async"], "signature-variants") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_hrd_score_in_report(self, loaded_reports) -> None: """Test that HRD score is present in the loaded report.""" - report = loaded_reports['sync'][1]['reports'][0] - assert 'hrdScore' in report - assert report['hrdScore'] == 9999.0 + report = loaded_reports["sync"][1]["reports"][0] + assert "hrdScore" in report + assert report["hrdScore"] == 9999.0 def test_kb_matches_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'kb-matches') + section = get_section(loaded_reports["sync"], "kb-matches") observed_and_matched = set( - [(item['kbVariant'], item['variant']['displayName']) for item in section] + [(item["kbVariant"], item["variant"]["displayName"]) for item in section] ) for pair in [ - ('ERBB2 amplification', 'amplification'), - ('FGFR2 mutation', 'FGFR2:p.R421C'), - ('PTP4A3 overexpression', 'increased expression'), - ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'), - ('CDKN2A mutation', 'CDKN2A:p.T18M'), + ("ERBB2 amplification", "amplification"), + ("FGFR2 mutation", "FGFR2:p.R421C"), + ("PTP4A3 overexpression", "increased expression"), + ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"), + ("CDKN2A mutation", "CDKN2A:p.T18M"), ]: assert pair in observed_and_matched - async_section = get_section(loaded_reports['async'], 'kb-matches') + async_section = get_section(loaded_reports["async"], "kb-matches") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_therapeutic_targets_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'therapeutic-targets') - therapeutic_target_genes = set([item['gene'] for item in section]) - for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']: + section = get_section(loaded_reports["sync"], "therapeutic-targets") + therapeutic_target_genes = set([item["gene"] for item in section]) + for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]: assert gene in therapeutic_target_genes - async_section = get_section(loaded_reports['async'], 'therapeutic-targets') + async_section = get_section(loaded_reports["async"], "therapeutic-targets") async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified') - variants = set([item['geneVariant'] for item in section]) + section = get_section( + loaded_reports["sync"], "summary/genomic-alterations-identified" + ) + variants = set([item["geneVariant"] for item in section]) for variant in [ - 'FGFR2:p.R421C', - 'PTP4A3 (high_percentile)', - 'ERBB2 (Amplification)', - '(EWSR1,FLI1):fusion(e.7,e.4)', - 'CDKN2A:p.T18M', + "FGFR2:p.R421C", + "PTP4A3 (high_percentile)", + "ERBB2 (Amplification)", + "(EWSR1,FLI1):fusion(e.7,e.4)", + "CDKN2A:p.T18M", ]: assert variant in variants async_section = get_section( - loaded_reports['async'], 'summary/genomic-alterations-identified' + loaded_reports["async"], "summary/genomic-alterations-identified" ) async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_analyst_comments_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments') - assert sync_section['comments'] - async_section = get_section(loaded_reports['async'], 'summary/analyst-comments') - assert async_section['comments'] - assert sync_section['comments'] == async_section['comments'] + sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments") + assert sync_section["comments"] + async_section = get_section(loaded_reports["async"], "summary/analyst-comments") + assert async_section["comments"] + assert sync_section["comments"] == async_section["comments"] def test_sample_info_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports['sync'], 'sample-info') - async_section = get_section(loaded_reports['async'], 'sample-info') - async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section) + sync_section = get_section(loaded_reports["sync"], "sample-info") + async_section = get_section(loaded_reports["async"], "sample-info") + async_equals_sync = stringify_sorted(sync_section) == stringify_sorted( + async_section + ) assert async_equals_sync - def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) -> None: + def test_multivariant_multiconditionset_statements_loaded( + self, loaded_reports + ) -> None: """ Checks that multivariant statements and multiple condition sets prepared correctly by this package are handled as expected by the api. @@ -348,31 +390,41 @@ def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) are met. This is also a test of multiple condition sets since there are two variants in the test data that satisfy one of the conditions (the APC mutation).""" - section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements') - multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369'] + section = get_section( + loaded_reports["sync"], "kb-matches/kb-matched-statements" + ) + multivariant_stmts = [ + item for item in section if item["reference"] == "pmid:27302369" + ] # if this statement is entered more than once there may be multiple sets of records to # check, so to make sure the count checks work, go stmt_id by stmt_id: - stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts])) + stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts])) for stmt_id in stmt_ids: - stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id] + stmts = [ + item for item in multivariant_stmts if item["kbStatementId"] == stmt_id + ] # we expect three stmts, one for each condition set assert len(stmts) == 3 # we expect each condition set to have two kb variants in it # we expect the two kb variants to be the same in each stmt - assert len(stmts[0]['kbMatches']) == 2 - assert len(stmts[1]['kbMatches']) == 2 - kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']] - kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']] + assert len(stmts[0]["kbMatches"]) == 2 + assert len(stmts[1]["kbMatches"]) == 2 + kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]] + kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]] kbmatches1.sort() kbmatches2.sort() - assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation'] + assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"] # we expect the two stmts to have different observed variant sets - observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']] - observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']] + observedVariants1 = [ + item["variant"]["ident"] for item in stmts[0]["kbMatches"] + ] + observedVariants2 = [ + item["variant"]["ident"] for item in stmts[1]["kbMatches"] + ] observedVariants1.sort() observedVariants2.sort() assert observedVariants1 != observedVariants2 From 72d6805e1c51e73b627997971b0d07910dc91b39 Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 20 Apr 2026 11:04:34 -0700 Subject: [PATCH 2/8] commit to save --- pori_python/ipr/content.spec.json | 1 + tests/test_ipr/test_upload.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index 711f9eb5..dfbfb324 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -893,6 +893,7 @@ "type": "string" }, "seqQC": { + "default": [], "type": "array", "items": { "type": "object", diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py index 80bdd4b5..f8eaa246 100644 --- a/tests/test_ipr/test_upload.py +++ b/tests/test_ipr/test_upload.py @@ -368,6 +368,18 @@ def test_analyst_comments_loaded(self, loaded_reports) -> None: assert async_section["comments"] assert sync_section["comments"] == async_section["comments"] + def test_seqqc_loaded(self, loaded_reports) -> None: + """Test that seqQC data is present in the loaded report.""" + sync_report = loaded_reports["sync"][1]["reports"][0] + assert "seqQC" in sync_report + assert len(sync_report["seqQC"]) == 2 + samples = [item["sample"] for item in sync_report["seqQC"]] + assert "Tumour DNA" in samples + assert "Constitutional DNA" in samples + async_report = loaded_reports["async"][1]["reports"][0] + assert "seqQC" in async_report + assert len(async_report["seqQC"]) == 2 + def test_sample_info_loaded(self, loaded_reports) -> None: sync_section = get_section(loaded_reports["sync"], "sample-info") async_section = get_section(loaded_reports["async"], "sample-info") From 4d0181bdcefe95e363cbbfde486d4b2b6f0ea0d7 Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 20 Apr 2026 11:08:47 -0700 Subject: [PATCH 3/8] format --- tests/test_ipr/test_upload.py | 428 ++++++++++++++++------------------ 1 file changed, 201 insertions(+), 227 deletions(-) diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py index f8eaa246..06d60eb2 100644 --- a/tests/test_ipr/test_upload.py +++ b/tests/test_ipr/test_upload.py @@ -13,132 +13,126 @@ from .constants import EXCLUDE_INTEGRATION_TESTS -EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1" -EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1" -INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1" -DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1" +EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1' +EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1' +INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1' +DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1' def get_test_spec(): - ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}} + ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}} ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__ for key in ipr_gene_keys: - ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = "" + ipr_spec['components']['schemas']['genesCreate']['properties'][key] = '' return ipr_spec def get_test_file(name: str) -> str: - return os.path.join(os.path.dirname(__file__), "test_data", name) + return os.path.join(os.path.dirname(__file__), 'test_data', name) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def loaded_reports(tmp_path_factory) -> Generator: - json_file = tmp_path_factory.mktemp("inputs") / "content.json" - async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json" - patient_id = f"TEST_{str(uuid.uuid4())}" - async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}" + json_file = tmp_path_factory.mktemp('inputs') / 'content.json' + async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json' + patient_id = f'TEST_{str(uuid.uuid4())}' + async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}' json_contents = { - "comparators": [ - {"analysisRole": "expression (disease)", "name": "1"}, - {"analysisRole": "expression (primary site)", "name": "2"}, - {"analysisRole": "expression (biopsy site)", "name": "3"}, + 'comparators': [ + {'analysisRole': 'expression (disease)', 'name': '1'}, + {'analysisRole': 'expression (primary site)', 'name': '2'}, + {'analysisRole': 'expression (biopsy site)', 'name': '3'}, { - "analysisRole": "expression (internal pancancer cohort)", - "name": "4", + 'analysisRole': 'expression (internal pancancer cohort)', + 'name': '4', }, ], - "patientId": patient_id, - "project": "TEST", - "sampleInfo": [ + 'patientId': patient_id, + 'project': 'TEST', + 'sampleInfo': [ { - "sample": "Constitutional", - "biopsySite": "Normal tissue", - "sampleName": "SAMPLE1-PB", - "primarySite": "Blood-Peripheral", - "collectionDate": "11-11-11", + 'sample': 'Constitutional', + 'biopsySite': 'Normal tissue', + 'sampleName': 'SAMPLE1-PB', + 'primarySite': 'Blood-Peripheral', + 'collectionDate': '11-11-11', }, { - "sample": "Tumour", - "pathoTc": "90%", - "biopsySite": "hepatic", - "sampleName": "SAMPLE2-FF-1", - "primarySite": "Vena Cava-Hepatic", - "collectionDate": "12-12-12", + 'sample': 'Tumour', + 'pathoTc': '90%', + 'biopsySite': 'hepatic', + 'sampleName': 'SAMPLE2-FF-1', + 'primarySite': 'Vena Cava-Hepatic', + 'collectionDate': '12-12-12', }, ], - "msi": [ + 'msi': [ { - "score": 1000.0, - "kbCategory": "microsatellite instability", + 'score': 1000.0, + 'kbCategory': 'microsatellite instability', } ], - "hrd": { - "score": 9999.0, - "cutoff": 5, + 'hrd': { + 'score': 9999.0, + 'cutoff': 5, }, - "expressionVariants": json.loads( - pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json( - orient="records" - ) + 'expressionVariants': json.loads( + pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records') ), - "smallMutations": json.loads( - pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json( - orient="records" + 'smallMutations': json.loads( + pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json( + orient='records' ) ), - "copyVariants": json.loads( - pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json( - orient="records" + 'copyVariants': json.loads( + pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json( + orient='records' ) ), - "structuralVariants": json.loads( - pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json( - orient="records" - ) + 'structuralVariants': json.loads( + pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records') ), - "kbDiseaseMatch": "colorectal cancer", - "cosmicSignatures": pd.read_csv( - get_test_file("cosmic_variants.tab"), sep="\t" + 'kbDiseaseMatch': 'colorectal cancer', + 'cosmicSignatures': pd.read_csv( + get_test_file('cosmic_variants.tab'), sep='\t' ).signature.tolist(), - "hlaTypes": json.loads( - pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json( - orient="records" - ) + 'hlaTypes': json.loads( + pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records') ), - "images": [ + 'images': [ { - "key": "cnvLoh.circos", - "path": "test/testData/images/cnvLoh.png", - "caption": "Test adding a caption to an image", + 'key': 'cnvLoh.circos', + 'path': 'test/testData/images/cnvLoh.png', + 'caption': 'Test adding a caption to an image', } ], - "seqQC": [ + 'seqQC': [ { - "sample": "Tumour DNA", - "reads": "2534M", - "library": "LIB0001", - "coverage": "80x", - "inputNg": "500", - "protocol": "WGS", - "sampleName": "SAMPLE2-FF-1", - "bioQC": "passed", - "labQC": "passed", - "duplicateReadsPerc": "12.3", + 'sample': 'Tumour DNA', + 'reads': '2534M', + 'library': 'LIB0001', + 'coverage': '80x', + 'inputNg': '500', + 'protocol': 'WGS', + 'sampleName': 'SAMPLE2-FF-1', + 'bioQC': 'passed', + 'labQC': 'passed', + 'duplicateReadsPerc': '12.3', }, { - "sample": "Constitutional DNA", - "reads": "1200M", - "library": "LIB0002", - "coverage": "40x", - "inputNg": "300", - "protocol": "WGS", - "sampleName": "SAMPLE1-PB", - "bioQC": "passed", - "labQC": "passed", - "duplicateReadsPerc": "8.1", + 'sample': 'Constitutional DNA', + 'reads': '1200M', + 'library': 'LIB0002', + 'coverage': '40x', + 'inputNg': '300', + 'protocol': 'WGS', + 'sampleName': 'SAMPLE1-PB', + 'bioQC': 'passed', + 'labQC': 'passed', + 'duplicateReadsPerc': '8.1', }, ], - "config": "test config", + 'config': 'test config', } json_file.write_text( @@ -148,7 +142,7 @@ def loaded_reports(tmp_path_factory) -> Generator: ) ) - json_contents["patientId"] = async_patient_id + json_contents['patientId'] = async_patient_id async_json_file.write_text( json.dumps( json_contents, @@ -157,46 +151,46 @@ def loaded_reports(tmp_path_factory) -> Generator: ) argslist = [ - "ipr", - "--username", - os.environ.get("IPR_USER", os.environ["USER"]), - "--password", - os.environ["IPR_PASS"], - "--graphkb_username", - os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])), - "--graphkb_password", - os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]), - "--ipr_url", - os.environ["IPR_TEST_URL"], - "--graphkb_url", - os.environ.get("GRAPHKB_URL", False), - "--therapeutics", - "--allow_partial_matches", + 'ipr', + '--username', + os.environ.get('IPR_USER', os.environ['USER']), + '--password', + os.environ['IPR_PASS'], + '--graphkb_username', + os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])), + '--graphkb_password', + os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']), + '--ipr_url', + os.environ['IPR_TEST_URL'], + '--graphkb_url', + os.environ.get('GRAPHKB_URL', False), + '--therapeutics', + '--allow_partial_matches', ] sync_argslist = argslist.copy() - sync_argslist.extend(["--content", str(json_file)]) - with patch.object(sys, "argv", sync_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + sync_argslist.extend(['--content', str(json_file)]) + with patch.object(sys, 'argv', sync_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() async_argslist = argslist.copy() - async_argslist.extend(["--content", str(async_json_file), "--async_upload"]) - with patch.object(sys, "argv", async_argslist): - with patch.object(IprConnection, "get_spec", return_value=get_test_spec()): + async_argslist.extend(['--content', str(async_json_file), '--async_upload']) + with patch.object(sys, 'argv', async_argslist): + with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()): command_interface() ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}") - async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}") + loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}') + async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}') loaded_reports_result = { - "sync": (patient_id, loaded_report), - "async": (async_patient_id, async_loaded_report), + 'sync': (patient_id, loaded_report), + 'async': (async_patient_id, async_loaded_report), } yield loaded_reports_result if DELETE_UPLOAD_TEST_REPORTS: @@ -205,13 +199,13 @@ def loaded_reports(tmp_path_factory) -> Generator: def get_section(loaded_report, section_name): - ident = loaded_report[1]["reports"][0]["ident"] + ident = loaded_report[1]['reports'][0]['ident'] ipr_conn = IprConnection( - username=os.environ.get("IPR_USER", os.environ["USER"]), - password=os.environ["IPR_PASS"], - url=os.environ["IPR_TEST_URL"], + username=os.environ.get('IPR_USER', os.environ['USER']), + password=os.environ['IPR_PASS'], + url=os.environ['IPR_TEST_URL'], ) - return ipr_conn.get(uri=f"reports/{ident}/{section_name}") + return ipr_conn.get(uri=f'reports/{ident}/{section_name}') def stringify_sorted(obj): @@ -224,7 +218,7 @@ def stringify_sorted(obj): obj.sort() return str(obj) elif isinstance(obj, dict): - for key in ("ident", "updatedAt", "createdAt", "deletedAt"): + for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'): obj.pop(key, None) keys = obj.keys() for key in keys: @@ -240,157 +234,147 @@ def stringify_sorted(obj): @pytest.mark.skipif( - not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance" -) -@pytest.mark.skipif( - EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests" + not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance' ) +@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests') class TestCreateReport: def test_patient_id_loaded_once(self, loaded_reports) -> None: - sync_patient_id = loaded_reports["sync"][0] - assert loaded_reports["sync"][1]["total"] == 1 - assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id - async_patient_id = loaded_reports["async"][0] - assert loaded_reports["async"][1]["total"] == 1 - assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id + sync_patient_id = loaded_reports['sync'][0] + assert loaded_reports['sync'][1]['total'] == 1 + assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id + async_patient_id = loaded_reports['async'][0] + assert loaded_reports['async'][1]['total'] == 1 + assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id def test_expression_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "expression-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "expression-variants") + section = get_section(loaded_reports['sync'], 'expression-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'expression-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_structural_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "structural-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [ - item["displayName"] for item in kbmatched - ] - async_section = get_section(loaded_reports["async"], "structural-variants") + section = get_section(loaded_reports['sync'], 'structural-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'structural-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_small_mutations_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "small-mutations") - kbmatched = [item for item in section if item["kbMatches"]] - assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched] - assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched] - async_section = get_section(loaded_reports["async"], "small-mutations") + section = get_section(loaded_reports['sync'], 'small-mutations') + kbmatched = [item for item in section if item['kbMatches']] + assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched] + assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched] + async_section = get_section(loaded_reports['async'], 'small-mutations') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_copy_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "copy-variants") - kbmatched = [item for item in section if item["kbMatches"]] - assert ("ERBB2", "amplification") in [ - (item["gene"]["name"], item["displayName"]) for item in kbmatched + section = get_section(loaded_reports['sync'], 'copy-variants') + kbmatched = [item for item in section if item['kbMatches']] + assert ('ERBB2', 'amplification') in [ + (item['gene']['name'], item['displayName']) for item in kbmatched ] - async_section = get_section(loaded_reports["async"], "copy-variants") + async_section = get_section(loaded_reports['async'], 'copy-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_signature_variants_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "signature-variants") - kbmatched = [item for item in section if item["kbMatches"]] + section = get_section(loaded_reports['sync'], 'signature-variants') + kbmatched = [item for item in section if item['kbMatches']] # Check for COSMIC signatures - assert ("SBS2", "high signature") in [ - (item["signatureName"], item["variantTypeName"]) for item in kbmatched + assert ('SBS2', 'high signature') in [ + (item['signatureName'], item['variantTypeName']) for item in kbmatched ] # Check for HRD signature (score 9999 > cutoff 5, so strong signature) - assert ("homologous recombination deficiency", "strong signature") in [ - (item["signatureName"], item["variantTypeName"]) for item in kbmatched + assert ('homologous recombination deficiency', 'strong signature') in [ + (item['signatureName'], item['variantTypeName']) for item in kbmatched ] # Check for MSI signature - assert ("microsatellite instability", "high signature") in [ - (item["signatureName"], item["variantTypeName"]) for item in kbmatched + assert ('microsatellite instability', 'high signature') in [ + (item['signatureName'], item['variantTypeName']) for item in kbmatched ] - async_section = get_section(loaded_reports["async"], "signature-variants") + async_section = get_section(loaded_reports['async'], 'signature-variants') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_hrd_score_in_report(self, loaded_reports) -> None: """Test that HRD score is present in the loaded report.""" - report = loaded_reports["sync"][1]["reports"][0] - assert "hrdScore" in report - assert report["hrdScore"] == 9999.0 + report = loaded_reports['sync'][1]['reports'][0] + assert 'hrdScore' in report + assert report['hrdScore'] == 9999.0 def test_kb_matches_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "kb-matches") + section = get_section(loaded_reports['sync'], 'kb-matches') observed_and_matched = set( - [(item["kbVariant"], item["variant"]["displayName"]) for item in section] + [(item['kbVariant'], item['variant']['displayName']) for item in section] ) for pair in [ - ("ERBB2 amplification", "amplification"), - ("FGFR2 mutation", "FGFR2:p.R421C"), - ("PTP4A3 overexpression", "increased expression"), - ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"), - ("CDKN2A mutation", "CDKN2A:p.T18M"), + ('ERBB2 amplification', 'amplification'), + ('FGFR2 mutation', 'FGFR2:p.R421C'), + ('PTP4A3 overexpression', 'increased expression'), + ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'), + ('CDKN2A mutation', 'CDKN2A:p.T18M'), ]: assert pair in observed_and_matched - async_section = get_section(loaded_reports["async"], "kb-matches") + async_section = get_section(loaded_reports['async'], 'kb-matches') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_therapeutic_targets_loaded(self, loaded_reports) -> None: - section = get_section(loaded_reports["sync"], "therapeutic-targets") - therapeutic_target_genes = set([item["gene"] for item in section]) - for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]: + section = get_section(loaded_reports['sync'], 'therapeutic-targets') + therapeutic_target_genes = set([item['gene'] for item in section]) + for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']: assert gene in therapeutic_target_genes - async_section = get_section(loaded_reports["async"], "therapeutic-targets") + async_section = get_section(loaded_reports['async'], 'therapeutic-targets') async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None: - section = get_section( - loaded_reports["sync"], "summary/genomic-alterations-identified" - ) - variants = set([item["geneVariant"] for item in section]) + section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified') + variants = set([item['geneVariant'] for item in section]) for variant in [ - "FGFR2:p.R421C", - "PTP4A3 (high_percentile)", - "ERBB2 (Amplification)", - "(EWSR1,FLI1):fusion(e.7,e.4)", - "CDKN2A:p.T18M", + 'FGFR2:p.R421C', + 'PTP4A3 (high_percentile)', + 'ERBB2 (Amplification)', + '(EWSR1,FLI1):fusion(e.7,e.4)', + 'CDKN2A:p.T18M', ]: assert variant in variants async_section = get_section( - loaded_reports["async"], "summary/genomic-alterations-identified" + loaded_reports['async'], 'summary/genomic-alterations-identified' ) async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section) assert async_equals_sync def test_analyst_comments_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments") - assert sync_section["comments"] - async_section = get_section(loaded_reports["async"], "summary/analyst-comments") - assert async_section["comments"] - assert sync_section["comments"] == async_section["comments"] + sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments') + assert sync_section['comments'] + async_section = get_section(loaded_reports['async'], 'summary/analyst-comments') + assert async_section['comments'] + assert sync_section['comments'] == async_section['comments'] def test_seqqc_loaded(self, loaded_reports) -> None: """Test that seqQC data is present in the loaded report.""" - sync_report = loaded_reports["sync"][1]["reports"][0] - assert "seqQC" in sync_report - assert len(sync_report["seqQC"]) == 2 - samples = [item["sample"] for item in sync_report["seqQC"]] - assert "Tumour DNA" in samples - assert "Constitutional DNA" in samples - async_report = loaded_reports["async"][1]["reports"][0] - assert "seqQC" in async_report - assert len(async_report["seqQC"]) == 2 + sync_report = loaded_reports['sync'][1]['reports'][0] + assert 'seqQC' in sync_report + assert len(sync_report['seqQC']) == 2 + samples = [item['sample'] for item in sync_report['seqQC']] + assert 'Tumour DNA' in samples + assert 'Constitutional DNA' in samples + async_report = loaded_reports['async'][1]['reports'][0] + assert 'seqQC' in async_report + assert len(async_report['seqQC']) == 2 def test_sample_info_loaded(self, loaded_reports) -> None: - sync_section = get_section(loaded_reports["sync"], "sample-info") - async_section = get_section(loaded_reports["async"], "sample-info") - async_equals_sync = stringify_sorted(sync_section) == stringify_sorted( - async_section - ) + sync_section = get_section(loaded_reports['sync'], 'sample-info') + async_section = get_section(loaded_reports['async'], 'sample-info') + async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section) assert async_equals_sync - def test_multivariant_multiconditionset_statements_loaded( - self, loaded_reports - ) -> None: + def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) -> None: """ Checks that multivariant statements and multiple condition sets prepared correctly by this package are handled as expected by the api. @@ -402,41 +386,31 @@ def test_multivariant_multiconditionset_statements_loaded( are met. This is also a test of multiple condition sets since there are two variants in the test data that satisfy one of the conditions (the APC mutation).""" - section = get_section( - loaded_reports["sync"], "kb-matches/kb-matched-statements" - ) - multivariant_stmts = [ - item for item in section if item["reference"] == "pmid:27302369" - ] + section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements') + multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369'] # if this statement is entered more than once there may be multiple sets of records to # check, so to make sure the count checks work, go stmt_id by stmt_id: - stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts])) + stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts])) for stmt_id in stmt_ids: - stmts = [ - item for item in multivariant_stmts if item["kbStatementId"] == stmt_id - ] + stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id] # we expect three stmts, one for each condition set assert len(stmts) == 3 # we expect each condition set to have two kb variants in it # we expect the two kb variants to be the same in each stmt - assert len(stmts[0]["kbMatches"]) == 2 - assert len(stmts[1]["kbMatches"]) == 2 - kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]] - kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]] + assert len(stmts[0]['kbMatches']) == 2 + assert len(stmts[1]['kbMatches']) == 2 + kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']] + kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']] kbmatches1.sort() kbmatches2.sort() - assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"] + assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation'] # we expect the two stmts to have different observed variant sets - observedVariants1 = [ - item["variant"]["ident"] for item in stmts[0]["kbMatches"] - ] - observedVariants2 = [ - item["variant"]["ident"] for item in stmts[1]["kbMatches"] - ] + observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']] + observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']] observedVariants1.sort() observedVariants2.sort() assert observedVariants1 != observedVariants2 From 1e9d46d5b1a20a432f59cc7766875f5c1c62a158 Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 20 Apr 2026 11:42:23 -0700 Subject: [PATCH 4/8] handle existing input format --- pori_python/ipr/inputs.py | 40 ++++++++++++ pori_python/ipr/main.py | 2 + tests/test_ipr/test_inputs.py | 115 ++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index f14fc696..6dbb9401 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -796,6 +796,46 @@ def check_null(checker, instance): DefaultValidatingDraft7Validator = extend_with_default(jsonschema.Draft7Validator) +def normalize_seqqc(content: Dict) -> Dict: + """ + Normalize seqQC field names from production report format to schema format. + + Maps inconsistent casing and underscores in field names to match content.spec.json requirements. + For example: 'Reads' -> 'reads', 'Sample_Name' -> 'sampleName', etc. + + Args: + content: Report content dictionary that may contain seqQC array + + Returns: + The content dictionary with seqQC fields normalized in-place + """ + # Field name mapping from production/legacy format to schema format + field_mapping = { + 'Reads': 'reads', + 'Sample': 'sample', + 'Library': 'library', + 'Coverage': 'coverage', + 'Input_ng': 'inputNg', + 'Input_ug': 'inputUg', + 'Protocol': 'protocol', + 'Sample Name': 'sampleName', + 'Duplicate_Reads_Perc': 'duplicateReadsPerc', + } + + if 'seqQC' in content and isinstance(content['seqQC'], list): + for item in content['seqQC']: + # Create a new dict with normalized keys + normalized_item = {} + for old_key, value in item.items(): + # Use mapped key if it exists, otherwise keep original + new_key = field_mapping.get(old_key, old_key) + normalized_item[new_key] = value + # Replace the item with normalized version + content['seqQC'][content['seqQC'].index(item)] = normalized_item + + return content + + def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> None: """ Validate a report content input JSON object against the schema specification diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index cbb7c128..eea1987e 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -27,6 +27,7 @@ from .inputs import ( check_comparators, check_variant_links, + normalize_seqqc, preprocess_copy_variants, preprocess_cosmic, preprocess_expression_variants, @@ -380,6 +381,7 @@ def ipr_report( return ipr_result # validate the JSON content follows the specification + normalize_seqqc(content) try: validate_report_content(content) except jsonschema.exceptions.ValidationError as err: diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py index 4bdd6b6d..d6e12493 100644 --- a/tests/test_ipr/test_inputs.py +++ b/tests/test_ipr/test_inputs.py @@ -17,6 +17,7 @@ check_comparators, check_variant_links, create_graphkb_sv_notation, + normalize_seqqc, preprocess_copy_variants, preprocess_cosmic, preprocess_expression_variants, @@ -558,3 +559,117 @@ def test_valid_json_inputs(example_name: str): with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh: content = json.load(fh) validate_report_content(content) + + +class TestNormalizeSeqQC: + """Test seqQC field name normalization from production format to schema format.""" + + def test_normalize_seqqc_production_format(self): + """Test normalization of production report field names.""" + content = { + 'seqQC': [ + { + 'Reads': '2407M', + 'Sample': 'Tumour DNA', + 'Library': 'LIB0001', + 'Coverage': '96X', + 'Input_ng': 400, + 'Input_ug': '', + 'Protocol': 'Genome Shotgun FFPE 4.2', + 'Sample Name': 'SAMPLE-T-01', + 'bioQC': 'Passed', + 'labQC': 'Approved', + 'Duplicate_Reads_Perc': 18, + } + ] + } + + result = normalize_seqqc(content) + + assert result['seqQC'][0]['reads'] == '2407M' + assert result['seqQC'][0]['sample'] == 'Tumour DNA' + assert result['seqQC'][0]['library'] == 'LIB0001' + assert result['seqQC'][0]['coverage'] == '96X' + assert result['seqQC'][0]['inputNg'] == 400 + assert result['seqQC'][0]['inputUg'] == '' + assert result['seqQC'][0]['protocol'] == 'Genome Shotgun FFPE 4.2' + assert result['seqQC'][0]['sampleName'] == 'SAMPLE-T-01' + assert result['seqQC'][0]['bioQC'] == 'Passed' + assert result['seqQC'][0]['labQC'] == 'Approved' + assert result['seqQC'][0]['duplicateReadsPerc'] == 18 + # Old keys should be gone + assert 'Reads' not in result['seqQC'][0] + assert 'Sample' not in result['seqQC'][0] + + def test_normalize_seqqc_already_normalized(self): + """Test that already-normalized field names are preserved.""" + content = { + 'seqQC': [ + { + 'reads': '1200M', + 'sample': 'Constitutional DNA', + 'library': 'LIB0002', + 'coverage': '40x', + 'inputNg': '300', + 'protocol': 'WGS', + 'sampleName': 'SAMPLE-N-01', + 'bioQC': 'passed', + 'labQC': 'passed', + 'duplicateReadsPerc': '8.1', + } + ] + } + + result = normalize_seqqc(content) + + # All normalized keys should still exist with same values + assert result['seqQC'][0]['reads'] == '1200M' + assert result['seqQC'][0]['sample'] == 'Constitutional DNA' + assert result['seqQC'][0]['inputNg'] == '300' + + def test_normalize_seqqc_no_seqqc_field(self): + """Test that content without seqQC is unchanged.""" + content = { + 'patientId': 'TEST001', + 'project': 'TEST', + } + + result = normalize_seqqc(content) + + assert result == content + assert 'seqQC' not in result + + def test_normalize_seqqc_empty_seqqc(self): + """Test that empty seqQC array is handled.""" + content = {'seqQC': []} + + result = normalize_seqqc(content) + + assert result['seqQC'] == [] + + def test_normalize_seqqc_multiple_items(self): + """Test normalization of multiple seqQC items.""" + content = { + 'seqQC': [ + { + 'Reads': '2534M', + 'Sample': 'Tumour DNA', + 'Duplicate_Reads_Perc': 12.3, + }, + { + 'Reads': '1200M', + 'Sample': 'Constitutional DNA', + 'Duplicate_Reads_Perc': 8.1, + }, + ] + } + + result = normalize_seqqc(content) + + assert len(result['seqQC']) == 2 + assert result['seqQC'][0]['reads'] == '2534M' + assert result['seqQC'][0]['sample'] == 'Tumour DNA' + assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3 + assert result['seqQC'][1]['reads'] == '1200M' + assert result['seqQC'][1]['sample'] == 'Constitutional DNA' + assert result['seqQC'][1]['duplicateReadsPerc'] == 8.1 From e0ee8ec7cdc8e7901cbde5e53877898f45de3bc1 Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Fri, 8 May 2026 10:49:43 -0700 Subject: [PATCH 5/8] format with ruff --- pori_python/ipr/inputs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index f76843e0..2dfac499 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -804,13 +804,13 @@ def check_null(checker, instance): def normalize_seqqc(content: Dict) -> Dict: """ Normalize seqQC field names from production report format to schema format. - + Maps inconsistent casing and underscores in field names to match content.spec.json requirements. For example: 'Reads' -> 'reads', 'Sample_Name' -> 'sampleName', etc. - + Args: content: Report content dictionary that may contain seqQC array - + Returns: The content dictionary with seqQC fields normalized in-place """ @@ -826,7 +826,7 @@ def normalize_seqqc(content: Dict) -> Dict: 'Sample Name': 'sampleName', 'Duplicate_Reads_Perc': 'duplicateReadsPerc', } - + if 'seqQC' in content and isinstance(content['seqQC'], list): for item in content['seqQC']: # Create a new dict with normalized keys @@ -837,7 +837,7 @@ def normalize_seqqc(content: Dict) -> Dict: normalized_item[new_key] = value # Replace the item with normalized version content['seqQC'][content['seqQC'].index(item)] = normalized_item - + return content From dcedd6390c095baac3c7750fc89bf8b7c2e0e1c0 Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 11 May 2026 13:01:55 -0700 Subject: [PATCH 6/8] fix issues raised in pr --- pori_python/ipr/content.spec.json | 6 +++++ pori_python/ipr/inputs.py | 8 +++--- tests/test_ipr/test_inputs.py | 45 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index 830218ae..c994ba6f 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -971,6 +971,8 @@ "example": "500", "type": [ "string", + "number", + "integer", "null" ] }, @@ -979,6 +981,8 @@ "example": "0.5", "type": [ "string", + "number", + "integer", "null" ] }, @@ -1003,6 +1007,8 @@ "example": "12.3", "type": [ "string", + "number", + "integer", "null" ] } diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index 2dfac499..5c7f10fb 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -806,7 +806,7 @@ def normalize_seqqc(content: Dict) -> Dict: Normalize seqQC field names from production report format to schema format. Maps inconsistent casing and underscores in field names to match content.spec.json requirements. - For example: 'Reads' -> 'reads', 'Sample_Name' -> 'sampleName', etc. + For example: 'Reads' -> 'reads', 'Sample Name' -> 'sampleName', etc. Args: content: Report content dictionary that may contain seqQC array @@ -828,7 +828,9 @@ def normalize_seqqc(content: Dict) -> Dict: } if 'seqQC' in content and isinstance(content['seqQC'], list): - for item in content['seqQC']: + for i, item in enumerate(content['seqQC']): + if not isinstance(item, dict): + continue # Create a new dict with normalized keys normalized_item = {} for old_key, value in item.items(): @@ -836,7 +838,7 @@ def normalize_seqqc(content: Dict) -> Dict: new_key = field_mapping.get(old_key, old_key) normalized_item[new_key] = value # Replace the item with normalized version - content['seqQC'][content['seqQC'].index(item)] = normalized_item + content['seqQC'][i] = normalized_item return content diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py index d6e12493..f3cd6f99 100644 --- a/tests/test_ipr/test_inputs.py +++ b/tests/test_ipr/test_inputs.py @@ -673,3 +673,48 @@ def test_normalize_seqqc_multiple_items(self): assert result['seqQC'][1]['reads'] == '1200M' assert result['seqQC'][1]['sample'] == 'Constitutional DNA' assert result['seqQC'][1]['duplicateReadsPerc'] == 8.1 + + def test_normalize_seqqc_numeric_fields_pass_validation(self): + """Test that integer/float values for inputNg, inputUg, duplicateReadsPerc pass schema validation.""" + content = { + 'patientId': 'PATIENT001', + 'kbDiseaseMatch': 'colorectal cancer', + 'project': 'TEST', + 'template': 'genomic', + 'seqQC': [ + { + 'reads': '2407M', + 'sample': 'Tumour DNA', + 'library': 'LIB0001', + 'inputNg': 400, + 'inputUg': 0.4, + 'duplicateReadsPerc': 18, + } + ], + } + # Should not raise + validate_report_content(content) + + def test_normalize_seqqc_numeric_float_duplicateReadsPerc_passes_validation(self): + """Test that a float duplicateReadsPerc value passes schema validation after normalization.""" + content = { + 'patientId': 'PATIENT001', + 'kbDiseaseMatch': 'colorectal cancer', + 'project': 'TEST', + 'template': 'genomic', + 'seqQC': [ + { + 'Reads': '2534M', + 'Sample': 'Tumour DNA', + 'Duplicate_Reads_Perc': 12.3, + 'Input_ng': 500, + 'Input_ug': 0.5, + } + ], + } + result = normalize_seqqc(content) + assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3 + assert result['seqQC'][0]['inputNg'] == 500 + assert result['seqQC'][0]['inputUg'] == 0.5 + # Should not raise after normalization + validate_report_content(result) From 416146e0ba5b9b67d73ef89e65cf2cfad4c3153c Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 11 May 2026 13:14:41 -0700 Subject: [PATCH 7/8] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- pori_python/ipr/inputs.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index 5c7f10fb..25901fc3 100644 --- a/pori_python/ipr/inputs.py +++ b/pori_python/ipr/inputs.py @@ -826,17 +826,25 @@ def normalize_seqqc(content: Dict) -> Dict: 'Sample Name': 'sampleName', 'Duplicate_Reads_Perc': 'duplicateReadsPerc', } + normalized_keys = set(field_mapping.values()) if 'seqQC' in content and isinstance(content['seqQC'], list): for i, item in enumerate(content['seqQC']): if not isinstance(item, dict): continue - # Create a new dict with normalized keys + # Preserve already-normalized keys (and unrelated keys) first so + # legacy aliases cannot overwrite them based on insertion order. normalized_item = {} - for old_key, value in item.items(): - # Use mapped key if it exists, otherwise keep original - new_key = field_mapping.get(old_key, old_key) - normalized_item[new_key] = value + for key, value in item.items(): + if key in normalized_keys or key not in field_mapping: + normalized_item[key] = value + + # Add legacy aliases only when the normalized key is not already + # present. This makes collision handling explicit and stable. + for old_key, new_key in field_mapping.items(): + if old_key in item and new_key not in normalized_item: + normalized_item[new_key] = item[old_key] + # Replace the item with normalized version content['seqQC'][i] = normalized_item From e2c25efa7017b42af99c19805eee475a7a31402e Mon Sep 17 00:00:00 2001 From: Eleanor Lewis Date: Mon, 11 May 2026 13:15:35 -0700 Subject: [PATCH 8/8] normalize before upload_json as well --- pori_python/ipr/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py index 9d8f0568..c249b791 100644 --- a/pori_python/ipr/main.py +++ b/pori_python/ipr/main.py @@ -403,6 +403,10 @@ def ipr_report( ipr_result = ipr_conn.validate_json(content) return ipr_result + # seqqc normalization is a bridging measure only; + # validate_json should be called on non-normalized json + normalize_seqqc(content) + if upload_json: if not ipr_conn: raise ValueError('ipr_url required to upload json') @@ -412,7 +416,6 @@ def ipr_report( return ipr_result # validate the JSON content follows the specification - normalize_seqqc(content) try: validate_report_content(content) except jsonschema.exceptions.ValidationError as err: