bcgsc · elewis2 · Apr 17, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
@@ -912,6 +912,109 @@
             "example": "POG",
             "type": "string"
         },
+        "seqQC": {
+            "default": [],
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "reads": {
+                        "description": "Number of reads, written as a string with a magnitude suffix (e.g. 2534M)",
+                        "example": "2534M",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "bioQC": {
+                        "description": "Biological QC status",
+                        "example": "passed",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "labQC": {
+                        "description": "Lab QC status",
+                        "example": "passed",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "sample": {
+                        "description": "Sample label, e.g. Tumour DNA or Constitutional DNA",
+                        "example": "Tumour DNA",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "library": {
+                        "description": "Library identifier",
+                        "example": "LIB0001",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "coverage": {
+                        "description": "Sequencing coverage",
+                        "example": "80x",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "inputNg": {
+                        "description": "Input amount in nanograms",
+                        "example": "500",
+                        "type": [
+                            "string",
+                            "number",
+                            "integer",
+                            "null"
+                        ]
+                    },
+                    "inputUg": {
+                        "description": "Input amount in micrograms",
+                        "example": "0.5",
+                        "type": [
+                            "string",
+                            "number",
+                            "integer",
+                            "null"
+                        ]
+                    },
+                    "protocol": {
+                        "description": "Sequencing protocol",
+                        "example": "WGS",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "sampleName": {
+                        "description": "Full sample name",
+                        "example": "SAMPLE1-FF-1",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "duplicateReadsPerc": {
+                        "description": "Percentage of duplicate reads",
+                        "example": "12.3",
+                        "type": [
+                            "string",
+                            "number",
+                            "integer",
+                            "null"
+                        ]
+                    }
+                }
+            }
+        },
         "smallMutations": {
             "default": [],
             "items": {

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
@@ -801,6 +801,56 @@ def check_null(checker, instance):
 DefaultValidatingDraft7Validator = extend_with_default(jsonschema.Draft7Validator)
 
 
+def normalize_seqqc(content: Dict) -> Dict:
+    """
+    Rename legacy seqQC field names to the names used by content.spec.json.
+
+    Each dict in content['seqQC'] is rebuilt: keys that are not legacy aliases
+    are carried over untouched, then each legacy alias (e.g. 'Reads',
+    'Sample Name', 'Input_ng') is stored under its schema name ('reads',
+    'sampleName', 'inputNg'). If an item already has the schema name, that
+    value is kept and the legacy alias is dropped.
+
+    Args:
+        content: Report content dictionary that may contain a seqQC list
+
+    Returns:
+        The same content object; when seqQC is a list, its dict entries are
+        replaced by renamed copies. Anything else is left as it was.
+    """
+    # legacy (production report) name -> schema name
+    legacy_to_schema = {
+        'Reads': 'reads',
+        'Sample': 'sample',
+        'Library': 'library',
+        'Coverage': 'coverage',
+        'Input_ng': 'inputNg',
+        'Input_ug': 'inputUg',
+        'Protocol': 'protocol',
+        'Sample Name': 'sampleName',
+        'Duplicate_Reads_Perc': 'duplicateReadsPerc',
+    }
+
+    if 'seqQC' not in content or not isinstance(content['seqQC'], list):
+        return content
+
+    records = content['seqQC']
+    for index, record in enumerate(records):
+        if not isinstance(record, dict):
+            continue
+        # Start from every key that is not a legacy alias, so a schema-named
+        # value can never be overwritten by its alias, whatever the key order.
+        renamed = {key: value for key, value in record.items() if key not in legacy_to_schema}
+
+        # setdefault only fills a schema name that is still missing.
+        for legacy_key, schema_key in legacy_to_schema.items():
+            if legacy_key in record:
+                renamed.setdefault(schema_key, record[legacy_key])
+
+        records[index] = renamed
+    return content
+
+
 def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> None:
     """
     Validate a report content input JSON object against the schema specification

diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
@@ -28,6 +28,7 @@
 from .inputs import (
     check_comparators,
     check_variant_links,
+    normalize_seqqc,
     preprocess_copy_variants,
     preprocess_cosmic,
     preprocess_expression_variants,
@@ -402,6 +403,10 @@ def ipr_report(
         ipr_result = ipr_conn.validate_json(content)
         return ipr_result
 
+    # seqQC normalization is a bridging measure only, so it runs after the
+    # validate_json branch above, which must receive the non-normalized JSON
+    normalize_seqqc(content)
+
     if upload_json:
         if not ipr_conn:
             raise ValueError('ipr_url required to upload json')

diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py
@@ -17,6 +17,7 @@
     check_comparators,
     check_variant_links,
     create_graphkb_sv_notation,
+    normalize_seqqc,
     preprocess_copy_variants,
     preprocess_cosmic,
     preprocess_expression_variants,
@@ -558,3 +559,162 @@ def test_valid_json_inputs(example_name: str):
     with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh:
         content = json.load(fh)
     validate_report_content(content)
+
+
+class TestNormalizeSeqQC:
+    """Test seqQC field name normalization from production format to schema format."""
+
+    def test_normalize_seqqc_production_format(self):
+        """Test normalization of production report field names."""
+        content = {
+            'seqQC': [
+                {
+                    'Reads': '2407M',
+                    'Sample': 'Tumour DNA',
+                    'Library': 'LIB0001',
+                    'Coverage': '96X',
+                    'Input_ng': 400,
+                    'Input_ug': '',
+                    'Protocol': 'Genome Shotgun FFPE 4.2',
+                    'Sample Name': 'SAMPLE-T-01',
+                    'bioQC': 'Passed',
+                    'labQC': 'Approved',
+                    'Duplicate_Reads_Perc': 18,
+                }
+            ]
+        }
+
+        result = normalize_seqqc(content)
+
+        # Whole-dict equality also proves that none of the legacy keys survive
+        assert result['seqQC'][0] == {
+            'reads': '2407M',
+            'sample': 'Tumour DNA',
+            'library': 'LIB0001',
+            'coverage': '96X',
+            'inputNg': 400,
+            'inputUg': '',
+            'protocol': 'Genome Shotgun FFPE 4.2',
+            'sampleName': 'SAMPLE-T-01',
+            'bioQC': 'Passed',
+            'labQC': 'Approved',
+            'duplicateReadsPerc': 18,
+        }
+
+    def test_normalize_seqqc_already_normalized(self):
+        """Test that already-normalized field names are preserved."""
+        item = {
+            'reads': '1200M',
+            'sample': 'Constitutional DNA',
+            'library': 'LIB0002',
+            'coverage': '40x',
+            'inputNg': '300',
+            'protocol': 'WGS',
+            'sampleName': 'SAMPLE-N-01',
+            'bioQC': 'passed',
+            'labQC': 'passed',
+            'duplicateReadsPerc': '8.1',
+        }
+        content = {'seqQC': [dict(item)]}
+
+        result = normalize_seqqc(content)
+
+        # Every key and value should come through unchanged
+        assert result['seqQC'][0] == item
+
+    def test_normalize_seqqc_no_seqqc_field(self):
+        """Test that content without seqQC is unchanged."""
+        content = {
+            'patientId': 'TEST001',
+            'project': 'TEST',
+        }
+
+        result = normalize_seqqc(content)
+
+        # Compare with a literal: result is the same object as content
+        assert result == {'patientId': 'TEST001', 'project': 'TEST'}
+
+    def test_normalize_seqqc_empty_seqqc(self):
+        """Test that empty seqQC array is handled."""
+        content = {'seqQC': []}
+
+        result = normalize_seqqc(content)
+
+        assert result['seqQC'] == []
+
+    def test_normalize_seqqc_multiple_items(self):
+        """Test normalization of multiple seqQC items."""
+        content = {
+            'seqQC': [
+                {
+                    'Reads': '2534M',
+                    'Sample': 'Tumour DNA',
+                    'Duplicate_Reads_Perc': 12.3,
+                },
+                {
+                    'Reads': '1200M',
+                    'Sample': 'Constitutional DNA',
+                    'Duplicate_Reads_Perc': 8.1,
+                },
+            ]
+        }
+
+        result = normalize_seqqc(content)
+
+        assert result['seqQC'] == [
+            {'reads': '2534M', 'sample': 'Tumour DNA', 'duplicateReadsPerc': 12.3},
+            {'reads': '1200M', 'sample': 'Constitutional DNA', 'duplicateReadsPerc': 8.1},
+        ]
+
+    def test_normalize_seqqc_schema_key_wins_over_legacy_alias(self):
+        """Test that a schema-named key is kept when its legacy alias is also present."""
+        content = {'seqQC': [{'Reads': 'legacy', 'reads': 'schema', 'Sample': 'Tumour DNA'}]}
+
+        result = normalize_seqqc(content)
+
+        assert result['seqQC'][0] == {'reads': 'schema', 'sample': 'Tumour DNA'}
+
+    def test_normalize_seqqc_numeric_fields_pass_validation(self):
+        """Test that integer/float values for inputNg, inputUg, duplicateReadsPerc pass schema validation."""
+        content = {
+            'patientId': 'PATIENT001',
+            'kbDiseaseMatch': 'colorectal cancer',
+            'project': 'TEST',
+            'template': 'genomic',
+            'seqQC': [
+                {
+                    'reads': '2407M',
+                    'sample': 'Tumour DNA',
+                    'library': 'LIB0001',
+                    'inputNg': 400,
+                    'inputUg': 0.4,
+                    'duplicateReadsPerc': 18,
+                }
+            ],
+        }
+        # Should not raise
+        validate_report_content(content)
+
+    def test_normalize_seqqc_numeric_float_duplicateReadsPerc_passes_validation(self):
+        """Test that a float duplicateReadsPerc value passes schema validation after normalization."""
+        content = {
+            'patientId': 'PATIENT001',
+            'kbDiseaseMatch': 'colorectal cancer',
+            'project': 'TEST',
+            'template': 'genomic',
+            'seqQC': [
+                {
+                    'Reads': '2534M',
+                    'Sample': 'Tumour DNA',
+                    'Duplicate_Reads_Perc': 12.3,
+                    'Input_ng': 500,
+                    'Input_ug': 0.5,
+                }
+            ],
+        }
+        result = normalize_seqqc(content)
+        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
+        assert result['seqQC'][0]['inputNg'] == 500
+        assert result['seqQC'][0]['inputUg'] == 0.5
+        # Should not raise after normalization
+        validate_report_content(result)