Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions pori_python/ipr/content.spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,109 @@
"example": "POG",
"type": "string"
},
"seqQC": {
"default": [],
"type": "array",
"items": {
"type": "object",
"properties": {
"reads": {
"description": "Number of reads",
"example": "2534M",
"type": [
"string",
"null"
]
},
"bioQC": {
"description": "Biological QC status",
"example": "passed",
"type": [
"string",
"null"
]
},
"labQC": {
"description": "Lab QC status",
"example": "passed",
"type": [
"string",
"null"
]
},
"sample": {
"description": "Sample identifier, e.g. Tumour DNA, Constitutional DNA",
"example": "Tumour DNA",
"type": [
"string",
"null"
]
},
"library": {
"description": "Library identifier",
"example": "LIB0001",
"type": [
"string",
"null"
]
},
"coverage": {
"description": "Sequencing coverage",
"example": "80x",
"type": [
"string",
"null"
]
},
"inputNg": {
"description": "Input amount in nanograms",
"example": "500",
"type": [
"string",
"number",
"integer",
"null"
]
},
"inputUg": {
"description": "Input amount in micrograms",
"example": "0.5",
"type": [
"string",
"number",
"integer",
"null"
]
},
"protocol": {
"description": "Sequencing protocol",
"example": "WGS",
"type": [
"string",
"null"
]
},
"sampleName": {
"description": "Full sample name",
"example": "SAMPLE1-FF-1",
"type": [
"string",
"null"
]
},
"duplicateReadsPerc": {
"description": "Percentage of duplicate reads",
"example": "12.3",
"type": [
"string",
"number",
"integer",
"null"
]
Comment thread
elewis2 marked this conversation as resolved.
}
}
}
},
Comment thread
elewis2 marked this conversation as resolved.
"smallMutations": {
"default": [],
"items": {
Expand Down
50 changes: 50 additions & 0 deletions pori_python/ipr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,56 @@ def check_null(checker, instance):
DefaultValidatingDraft7Validator = extend_with_default(jsonschema.Draft7Validator)


def normalize_seqqc(content: Dict) -> Dict:
    """
    Normalize seqQC field names from production report format to schema format.

    Renames the legacy/production keys of every seqQC entry to the names used
    by content.spec.json, for example 'Reads' -> 'reads',
    'Sample Name' -> 'sampleName' and 'Input_ng' -> 'inputNg'. Keys that are
    not legacy aliases (already-normalized keys, 'bioQC', 'labQC', and any
    unrelated key) are kept unchanged.

    When an entry holds both a legacy alias and its normalized key, the value
    stored under the normalized key wins and the legacy value is dropped,
    regardless of the order in which the two keys appear in the entry.

    Nothing is changed when 'seqQC' is absent or is not a list, and entries
    that are not dicts are left untouched.

    Args:
        content: Report content dictionary that may contain seqQC array

    Returns:
        The same content dictionary that was passed in. The seqQC list is
        updated in place: each dict entry is replaced by a new, normalized
        dict. The return value exists for convenience only; callers may rely
        on the in-place update and ignore it.
    """
    # Field name mapping from production/legacy format to schema format
    field_mapping = {
        'Reads': 'reads',
        'Sample': 'sample',
        'Library': 'library',
        'Coverage': 'coverage',
        'Input_ng': 'inputNg',
        'Input_ug': 'inputUg',
        'Protocol': 'protocol',
        'Sample Name': 'sampleName',
        'Duplicate_Reads_Perc': 'duplicateReadsPerc',
    }

    if 'seqQC' not in content or not isinstance(content['seqQC'], list):
        return content

    entries = content['seqQC']
    for index, entry in enumerate(entries):
        if not isinstance(entry, dict):
            continue

        # Copy every key that is not a legacy alias first (this includes keys
        # that are already normalized), so a legacy alias can never overwrite
        # an explicitly provided normalized value.
        normalized_entry = {
            key: value for key, value in entry.items() if key not in field_mapping
        }

        # Add legacy aliases only when the normalized key is not already
        # present. This makes collision handling explicit and stable.
        for legacy_key, schema_key in field_mapping.items():
            if legacy_key in entry and schema_key not in normalized_entry:
                normalized_entry[schema_key] = entry[legacy_key]

        # Replace the entry with its normalized version
        entries[index] = normalized_entry

    return content
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

side-effect-driven programming... Do you still want to return?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline, I see what you're saying and will fix



def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> None:
"""
Validate a report content input JSON object against the schema specification
Expand Down
5 changes: 5 additions & 0 deletions pori_python/ipr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .inputs import (
check_comparators,
check_variant_links,
normalize_seqqc,
preprocess_copy_variants,
preprocess_cosmic,
preprocess_expression_variants,
Expand Down Expand Up @@ -402,6 +403,10 @@ def ipr_report(
ipr_result = ipr_conn.validate_json(content)
return ipr_result

# seqQC normalization is a bridging measure only; it is applied after the
# validate_json call above so that validate_json receives the original,
# non-normalized JSON.
normalize_seqqc(content)

if upload_json:
if not ipr_conn:
raise ValueError('ipr_url required to upload json')
Expand Down
160 changes: 160 additions & 0 deletions tests/test_ipr/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
check_comparators,
check_variant_links,
create_graphkb_sv_notation,
normalize_seqqc,
preprocess_copy_variants,
preprocess_cosmic,
preprocess_expression_variants,
Expand Down Expand Up @@ -558,3 +559,162 @@ def test_valid_json_inputs(example_name: str):
with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh:
content = json.load(fh)
validate_report_content(content)


class TestNormalizeSeqQC:
    """Test seqQC field name normalization from production format to schema format."""

    def test_normalize_seqqc_production_format(self):
        """Test normalization of production report field names."""
        content = {
            'seqQC': [
                {
                    'Reads': '2407M',
                    'Sample': 'Tumour DNA',
                    'Library': 'LIB0001',
                    'Coverage': '96X',
                    'Input_ng': 400,
                    'Input_ug': '',
                    'Protocol': 'Genome Shotgun FFPE 4.2',
                    'Sample Name': 'SAMPLE-T-01',
                    'bioQC': 'Passed',
                    'labQC': 'Approved',
                    'Duplicate_Reads_Perc': 18,
                }
            ]
        }

        result = normalize_seqqc(content)

        item = result['seqQC'][0]
        assert item['reads'] == '2407M'
        assert item['sample'] == 'Tumour DNA'
        assert item['library'] == 'LIB0001'
        assert item['coverage'] == '96X'
        assert item['inputNg'] == 400
        assert item['inputUg'] == ''
        assert item['protocol'] == 'Genome Shotgun FFPE 4.2'
        assert item['sampleName'] == 'SAMPLE-T-01'
        assert item['bioQC'] == 'Passed'
        assert item['labQC'] == 'Approved'
        assert item['duplicateReadsPerc'] == 18
        # Old keys should be gone
        assert 'Reads' not in item
        assert 'Sample' not in item

    def test_normalize_seqqc_already_normalized(self):
        """Test that already-normalized field names are preserved."""
        content = {
            'seqQC': [
                {
                    'reads': '1200M',
                    'sample': 'Constitutional DNA',
                    'library': 'LIB0002',
                    'coverage': '40x',
                    'inputNg': '300',
                    'protocol': 'WGS',
                    'sampleName': 'SAMPLE-N-01',
                    'bioQC': 'passed',
                    'labQC': 'passed',
                    'duplicateReadsPerc': '8.1',
                }
            ]
        }

        result = normalize_seqqc(content)

        # Compare against a literal so that a dropped or renamed key fails the
        # test; checking a handful of keys would let such a regression through.
        assert result['seqQC'][0] == {
            'reads': '1200M',
            'sample': 'Constitutional DNA',
            'library': 'LIB0002',
            'coverage': '40x',
            'inputNg': '300',
            'protocol': 'WGS',
            'sampleName': 'SAMPLE-N-01',
            'bioQC': 'passed',
            'labQC': 'passed',
            'duplicateReadsPerc': '8.1',
        }

    def test_normalize_seqqc_no_seqqc_field(self):
        """Test that content without seqQC is unchanged."""
        content = {
            'patientId': 'TEST001',
            'project': 'TEST',
        }

        result = normalize_seqqc(content)

        # The function returns the object it was given, so comparing result
        # with content can never fail; compare with a literal instead.
        assert result is content
        assert result == {'patientId': 'TEST001', 'project': 'TEST'}
        assert 'seqQC' not in result

    def test_normalize_seqqc_empty_seqqc(self):
        """Test that empty seqQC array is handled."""
        content = {'seqQC': []}

        result = normalize_seqqc(content)

        assert result['seqQC'] == []

    def test_normalize_seqqc_multiple_items(self):
        """Test normalization of multiple seqQC items."""
        content = {
            'seqQC': [
                {
                    'Reads': '2534M',
                    'Sample': 'Tumour DNA',
                    'Duplicate_Reads_Perc': 12.3,
                },
                {
                    'Reads': '1200M',
                    'Sample': 'Constitutional DNA',
                    'Duplicate_Reads_Perc': 8.1,
                },
            ]
        }

        result = normalize_seqqc(content)

        assert len(result['seqQC']) == 2
        assert result['seqQC'][0]['reads'] == '2534M'
        assert result['seqQC'][0]['sample'] == 'Tumour DNA'
        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
        assert result['seqQC'][1]['reads'] == '1200M'
        assert result['seqQC'][1]['sample'] == 'Constitutional DNA'
        assert result['seqQC'][1]['duplicateReadsPerc'] == 8.1

    def test_normalize_seqqc_normalized_key_wins_over_legacy(self):
        """Test that a normalized key takes precedence over its legacy alias in either order."""
        content = {
            'seqQC': [
                {'Reads': 'legacy', 'reads': 'current'},
                {'reads': 'current', 'Reads': 'legacy'},
            ]
        }

        result = normalize_seqqc(content)

        assert result['seqQC'] == [{'reads': 'current'}, {'reads': 'current'}]

    def test_normalize_seqqc_non_dict_items_untouched(self):
        """Test that seqQC entries that are not dicts are left as they are."""
        content = {'seqQC': ['unexpected', {'Reads': '10M'}]}

        result = normalize_seqqc(content)

        assert result['seqQC'] == ['unexpected', {'reads': '10M'}]

    def test_normalize_seqqc_numeric_fields_pass_validation(self):
        """Test that integer/float values for inputNg, inputUg, duplicateReadsPerc pass schema validation."""
        content = {
            'patientId': 'PATIENT001',
            'kbDiseaseMatch': 'colorectal cancer',
            'project': 'TEST',
            'template': 'genomic',
            'seqQC': [
                {
                    'reads': '2407M',
                    'sample': 'Tumour DNA',
                    'library': 'LIB0001',
                    'inputNg': 400,
                    'inputUg': 0.4,
                    'duplicateReadsPerc': 18,
                }
            ],
        }
        # Should not raise
        validate_report_content(content)

    def test_normalize_seqqc_numeric_float_duplicateReadsPerc_passes_validation(self):
        """Test that a float duplicateReadsPerc value passes schema validation after normalization."""
        content = {
            'patientId': 'PATIENT001',
            'kbDiseaseMatch': 'colorectal cancer',
            'project': 'TEST',
            'template': 'genomic',
            'seqQC': [
                {
                    'Reads': '2534M',
                    'Sample': 'Tumour DNA',
                    'Duplicate_Reads_Perc': 12.3,
                    'Input_ng': 500,
                    'Input_ug': 0.5,
                }
            ],
        }
        result = normalize_seqqc(content)
        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
        assert result['seqQC'][0]['inputNg'] == 500
        assert result['seqQC'][0]['inputUg'] == 0.5
        # Should not raise after normalization
        validate_report_content(result)
Loading
Loading