Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions bin/validate_expdesign.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
Validates OpenMS experimental design files for duplicate (Fraction_Group, Fraction, Label) combinations.

This script checks the experimental design file generated from SDRF conversion to ensure that
each combination of (Fraction_Group, Fraction, Label) appears only once, which is a requirement
for downstream OpenMS tools like MSstatsConverter and ProteinQuantifier.
"""

import sys
import argparse
import csv
from collections import defaultdict


def _read_file_section(handle):
    """
    Collect the lines of the first table found in an open text file.

    Leading blank lines are skipped. Reading stops at the first blank
    (empty or whitespace-only) line after the table has started, so a second
    table that follows a blank separator is never returned.

    Args:
        handle: Iterable of text lines (e.g. an open file object)

    Returns:
        list: (1-based physical line number, raw line) tuples, header first
    """
    section = []
    for line_number, line in enumerate(handle, start=1):
        if line.strip():
            section.append((line_number, line))
        elif section:
            # A blank line after the table started ends the table.
            break
    return section


def _report_duplicates(duplicates):
    """
    Print one error block per duplicated combination, followed by the failure banner.

    Args:
        duplicates: Mapping of (Fraction_Group, Fraction, Label) tuples to the
            list of {'row': int, 'data': dict} occurrences (two or more each)
    """
    for (fraction_group, fraction, label), occurrences in duplicates.items():
        print(f"\nERROR: Duplicate (Fraction_Group={fraction_group}, Fraction={fraction}, Label={label}) combination found!")
        print(f"This combination appears {len(occurrences)} times in the following rows:")

        for occurrence in occurrences:
            data = occurrence['data']
            # Show the columns that help locate the offending rows.
            sample = data.get('Sample', 'N/A')
            spectra = data.get('Spectra_Filepath', data.get('MSRun', 'N/A'))
            print(f"  - Row {occurrence['row']}: Sample={sample}, Spectra={spectra}")

    print("\n" + "="*80)
    print("VALIDATION FAILED: Duplicate (Fraction_Group, Fraction, Label) combinations detected!")
    print("="*80)
    print("\nPlease fix the SDRF file to ensure each (Fraction_Group, Fraction, Label) combination is unique.")
    print("Common causes:")
    print("  - Duplicate label assignments for the same data file")
    print("  - Incorrect fraction or fraction group assignments")
    print("  - Copy-paste errors in the SDRF file")


def validate_expdesign(expdesign_file):
    """
    Validate the experimental design file for duplicate combinations.

    Only the first table of the file is inspected: rows are read up to the
    first blank line. This keeps a trailing table (such as the sample table
    of a two-table OpenMS design) from being counted as file rows. Row
    numbers in the report are physical line numbers of the file.

    All diagnostics are printed to stdout; no exception is raised for
    unreadable, undecodable or malformed files.

    Args:
        expdesign_file: Path to the OpenMS experimental design TSV file

    Returns:
        bool: True if validation passes, False otherwise (duplicates found,
        required columns missing, empty file, or the file could not be read)
    """
    print(f"Validating experimental design file: {expdesign_file}")

    required_cols = ['Fraction_Group', 'Fraction', 'Label']

    # Maps each combination to the rows (line number + parsed data) that use it.
    combinations = defaultdict(list)

    try:
        with open(expdesign_file, 'r', encoding='utf-8') as f:
            section = _read_file_section(f)

        if not section:
            print(f"ERROR: Experimental design file is empty: {expdesign_file}")
            return False

        reader = csv.DictReader((line for _, line in section), delimiter='\t')

        # Accessing fieldnames consumes the header line of the section.
        fieldnames = reader.fieldnames or []
        if not all(col in fieldnames for col in required_cols):
            print(f"ERROR: Missing required columns. Expected columns: {required_cols}")
            print(f"Found columns: {reader.fieldnames}")
            return False

        # section[1:] are the data lines; pair each parsed row with its line number.
        for (row_num, _), row in zip(section[1:], reader):
            combination = (
                row.get('Fraction_Group', ''),
                row.get('Fraction', ''),
                row.get('Label', ''),
            )
            combinations[combination].append({
                'row': row_num,
                'data': row
            })

    except FileNotFoundError:
        print(f"ERROR: File not found: {expdesign_file}")
        return False
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"ERROR: Failed to read or parse the file: {e}")
        return False

    duplicates = {
        combination: occurrences
        for combination, occurrences in combinations.items()
        if len(occurrences) > 1
    }

    if duplicates:
        _report_duplicates(duplicates)
        return False

    print(f"\n✓ Validation passed: All {len(combinations)} (Fraction_Group, Fraction, Label) combinations are unique.")
    return True


def main():
    """
    Command-line entry point.

    Parses the required --expdesign option, runs validate_expdesign on that
    path, and terminates the process with exit status 0 when validation
    passes or 1 when it fails.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Validate OpenMS experimental design files for duplicate combinations',
    )
    cli.add_argument(
        '--expdesign',
        help='Path to the OpenMS experimental design TSV file',
        required=True,
    )
    design_path = cli.parse_args().expdesign

    # Translate the boolean validation result into a process exit status.
    exit_status = 0 if validate_expdesign(design_path) else 1
    sys.exit(exit_status)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
5 changes: 4 additions & 1 deletion modules/local/preprocess_expdesign/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,16 @@ process PREPROCESS_EXPDESIGN {
# we edit the design here and change the endings.
sed 's/.raw\\t/.mzML\\t/I' ${design} > ${design.baseName}_openms_design.tsv

# Validate the experimental design for duplicate combinations
quantmsutilsc validateexpdesign --expdesign ${design.baseName}_openms_design.tsv 2>&1 | tee validation.log

# here we extract the filenames and fake an empty config (since the config values will be deduced from the workflow params)
a=\$(grep -n '^\$' ${design} | head -n 1 | awk -F ":" '{print \$1}')
sed -e ''"\${a}"',\$d' ${design} > ${design.baseName}_config.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
sdrf-pipelines: \$(parse_sdrf --version 2>/dev/null | awk -F ' ' '{print \$2}')
quantms-utils: \$(pip show quantms-utils | grep "Version" | awk -F ': ' '{print \$2}')
END_VERSIONS
"""
}
3 changes: 3 additions & 0 deletions modules/local/sdrf_parsing/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ process SDRF_PARSING {
mv openms.tsv ${sdrf.baseName}_config.tsv
mv experimental_design.tsv ${sdrf.baseName}_openms_design.tsv

# Validate the experimental design for duplicate combinations
validate_expdesign.py --expdesign ${sdrf.baseName}_openms_design.tsv 2>&1 | tee -a ${sdrf.baseName}_parsing.log

cat <<-END_VERSIONS > versions.yml
"${task.process}":
sdrf-pipelines: \$(parse_sdrf --version 2>/dev/null | awk -F ' ' '{print \$2}')
Expand Down