diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json index 2a092db7..c994ba6f 100644 --- a/pori_python/ipr/content.spec.json +++ b/pori_python/ipr/content.spec.json @@ -912,6 +912,109 @@ "example": "POG", "type": "string" }, + "seqQC": { + "default": [], + "type": "array", + "items": { + "type": "object", + "properties": { + "reads": { + "description": "Number of reads", + "example": "2534M", + "type": [ + "string", + "null" + ] + }, + "bioQC": { + "description": "Biological QC status", + "example": "passed", + "type": [ + "string", + "null" + ] + }, + "labQC": { + "description": "Lab QC status", + "example": "passed", + "type": [ + "string", + "null" + ] + }, + "sample": { + "description": "Sample identifier, e.g. Tumour DNA, Constitutional DNA", + "example": "Tumour DNA", + "type": [ + "string", + "null" + ] + }, + "library": { + "description": "Library identifier", + "example": "LIB0001", + "type": [ + "string", + "null" + ] + }, + "coverage": { + "description": "Sequencing coverage", + "example": "80x", + "type": [ + "string", + "null" + ] + }, + "inputNg": { + "description": "Input amount in nanograms", + "example": "500", + "type": [ + "string", + "number", + "integer", + "null" + ] + }, + "inputUg": { + "description": "Input amount in micrograms", + "example": "0.5", + "type": [ + "string", + "number", + "integer", + "null" + ] + }, + "protocol": { + "description": "Sequencing protocol", + "example": "WGS", + "type": [ + "string", + "null" + ] + }, + "sampleName": { + "description": "Full sample name", + "example": "SAMPLE1-FF-1", + "type": [ + "string", + "null" + ] + }, + "duplicateReadsPerc": { + "description": "Percentage of duplicate reads", + "example": "12.3", + "type": [ + "string", + "number", + "integer", + "null" + ] + } + } + } + }, "smallMutations": { "default": [], "items": { diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py index 68f7759a..25901fc3 100644 --- a/pori_python/ipr/inputs.py +++ 
def normalize_seqqc(content: Dict) -> Dict:
    """
    Normalize seqQC field names from production report format to schema format.

    Only the exact legacy names listed in ``field_mapping`` below are renamed
    (for example 'Reads' -> 'reads', 'Sample Name' -> 'sampleName'). Every
    other key, including keys that already use the schema name, is carried
    over untouched. When an item holds both a legacy alias and its schema
    name, the schema-named value wins and the legacy entry is dropped.

    Args:
        content: Report content dictionary that may contain a 'seqQC' list.

    Returns:
        The same ``content`` object. If 'seqQC' is present and is a list, each
        dict entry of that list is replaced by a new dict with normalized
        keys; non-dict entries are left as they are. If 'seqQC' is missing or
        is not a list, ``content`` is returned unchanged.
    """
    # Field name mapping from production/legacy format to schema format
    field_mapping = {
        'Reads': 'reads',
        'Sample': 'sample',
        'Library': 'library',
        'Coverage': 'coverage',
        'Input_ng': 'inputNg',
        'Input_ug': 'inputUg',
        'Protocol': 'protocol',
        'Sample Name': 'sampleName',
        'Duplicate_Reads_Perc': 'duplicateReadsPerc',
    }

    seqqc = content.get('seqQC')
    if not isinstance(seqqc, list):
        return content

    for i, item in enumerate(seqqc):
        if not isinstance(item, dict):
            continue

        # Copy everything that is not a legacy alias first. Schema-named keys
        # are never keys of field_mapping, so they are kept here and cannot be
        # overwritten by an alias regardless of insertion order.
        normalized_item = {
            key: value for key, value in item.items() if key not in field_mapping
        }

        # Add legacy aliases only when the schema-named key is not already
        # present, which keeps collision handling explicit and stable.
        for old_key, new_key in field_mapping.items():
            if old_key in item and new_key not in normalized_item:
                normalized_item[new_key] = item[old_key]

        seqqc[i] = normalized_item

    return content
class TestNormalizeSeqQC:
    """Test seqQC field name normalization from production format to schema format."""

    def test_normalize_seqqc_production_format(self):
        """Legacy production field names are renamed and no legacy key survives."""
        content = {
            'seqQC': [
                {
                    'Reads': '2407M',
                    'Sample': 'Tumour DNA',
                    'Library': 'LIB0001',
                    'Coverage': '96X',
                    'Input_ng': 400,
                    'Input_ug': '',
                    'Protocol': 'Genome Shotgun FFPE 4.2',
                    'Sample Name': 'SAMPLE-T-01',
                    'bioQC': 'Passed',
                    'labQC': 'Approved',
                    'Duplicate_Reads_Perc': 18,
                }
            ]
        }

        result = normalize_seqqc(content)

        # Comparing the whole item checks every renamed value and also proves
        # that none of the legacy keys were left behind.
        assert result['seqQC'] == [
            {
                'reads': '2407M',
                'sample': 'Tumour DNA',
                'library': 'LIB0001',
                'coverage': '96X',
                'inputNg': 400,
                'inputUg': '',
                'protocol': 'Genome Shotgun FFPE 4.2',
                'sampleName': 'SAMPLE-T-01',
                'bioQC': 'Passed',
                'labQC': 'Approved',
                'duplicateReadsPerc': 18,
            }
        ]

    def test_normalize_seqqc_already_normalized(self):
        """Already-normalized field names are preserved with their values."""
        item = {
            'reads': '1200M',
            'sample': 'Constitutional DNA',
            'library': 'LIB0002',
            'coverage': '40x',
            'inputNg': '300',
            'protocol': 'WGS',
            'sampleName': 'SAMPLE-N-01',
            'bioQC': 'passed',
            'labQC': 'passed',
            'duplicateReadsPerc': '8.1',
        }
        content = {'seqQC': [dict(item)]}

        result = normalize_seqqc(content)

        assert result['seqQC'] == [item]

    def test_normalize_seqqc_collision_prefers_normalized_key(self):
        """When a legacy alias and its schema name coexist, the schema value wins."""
        content = {'seqQC': [{'Reads': 'legacy', 'reads': 'schema', 'extra': 1}]}

        result = normalize_seqqc(content)

        assert result['seqQC'] == [{'reads': 'schema', 'extra': 1}]

    def test_normalize_seqqc_non_dict_items_untouched(self):
        """Non-dict entries in seqQC are skipped while dict entries are normalized."""
        content = {'seqQC': ['not-a-dict', {'Reads': '1M'}]}

        result = normalize_seqqc(content)

        assert result['seqQC'] == ['not-a-dict', {'reads': '1M'}]

    def test_normalize_seqqc_non_list_seqqc(self):
        """A seqQC value that is not a list is left as it is."""
        content = {'seqQC': None}

        result = normalize_seqqc(content)

        assert result == {'seqQC': None}

    def test_normalize_seqqc_no_seqqc_field(self):
        """Content without seqQC is unchanged."""
        content = {
            'patientId': 'TEST001',
            'project': 'TEST',
        }

        result = normalize_seqqc(content)

        # The function returns the object it was given, so compare against an
        # independent literal rather than against `content` itself.
        assert result == {'patientId': 'TEST001', 'project': 'TEST'}

    def test_normalize_seqqc_empty_seqqc(self):
        """An empty seqQC array is handled."""
        content = {'seqQC': []}

        result = normalize_seqqc(content)

        assert result['seqQC'] == []

    def test_normalize_seqqc_multiple_items(self):
        """Every item of a multi-item seqQC list is normalized."""
        content = {
            'seqQC': [
                {
                    'Reads': '2534M',
                    'Sample': 'Tumour DNA',
                    'Duplicate_Reads_Perc': 12.3,
                },
                {
                    'Reads': '1200M',
                    'Sample': 'Constitutional DNA',
                    'Duplicate_Reads_Perc': 8.1,
                },
            ]
        }

        result = normalize_seqqc(content)

        assert result['seqQC'] == [
            {'reads': '2534M', 'sample': 'Tumour DNA', 'duplicateReadsPerc': 12.3},
            {'reads': '1200M', 'sample': 'Constitutional DNA', 'duplicateReadsPerc': 8.1},
        ]

    def test_normalize_seqqc_numeric_fields_pass_validation(self):
        """Integer/float values for inputNg, inputUg, duplicateReadsPerc pass schema validation."""
        content = {
            'patientId': 'PATIENT001',
            'kbDiseaseMatch': 'colorectal cancer',
            'project': 'TEST',
            'template': 'genomic',
            'seqQC': [
                {
                    'reads': '2407M',
                    'sample': 'Tumour DNA',
                    'library': 'LIB0001',
                    'inputNg': 400,
                    'inputUg': 0.4,
                    'duplicateReadsPerc': 18,
                }
            ],
        }
        # Should not raise
        validate_report_content(content)

    def test_normalize_seqqc_numeric_float_duplicateReadsPerc_passes_validation(self):
        """A float duplicateReadsPerc value passes schema validation after normalization."""
        content = {
            'patientId': 'PATIENT001',
            'kbDiseaseMatch': 'colorectal cancer',
            'project': 'TEST',
            'template': 'genomic',
            'seqQC': [
                {
                    'Reads': '2534M',
                    'Sample': 'Tumour DNA',
                    'Duplicate_Reads_Perc': 12.3,
                    'Input_ng': 500,
                    'Input_ug': 0.5,
                }
            ],
        }
        result = normalize_seqqc(content)
        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
        assert result['seqQC'][0]['inputNg'] == 500
        assert result['seqQC'][0]['inputUg'] == 0.5
        # Should not raise after normalization
        validate_report_content(result)