Skip to content

Commit cae48d8

Browse files
authored
Fix: fetch sra metadata retraction module (#32)
This pull request fixes a bug where an SRA sample that is not retracted was being assigned a retraction date. It also includes minor updates to improve code clarity and maintain consistency across the pipeline. The main changes are a switch from `shell` to `script` for process execution, variable renaming for clarity, and minor formatting improvements. The version number has also been incremented.

Fix:
* Add assertions enforcing SRA retraction consistency (retracted => has date; not retracted => no date).

General improvements and maintenance:
* Changed process execution from `shell` to `script` in the `SRATOOLS_PREFETCH` process for better compatibility and maintainability (`modules/nf-core/sratools/prefetch/main.nf`).
* Updated the pipeline version from `1.5.1` to `1.5.2` in `nextflow.config`.

Code clarity and formatting:
* Renamed the unused lambda parameter from `sra` to `_sra` in the `FLUSRA` workflow to indicate it is not used (`workflows/flusra.nf`).
1 parent 3593351 commit cae48d8

4 files changed

Lines changed: 38 additions & 24 deletions

File tree

bin/fetch_sra_metadata.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ def parse_args() -> argparse.Namespace:
318318
parser = argparse.ArgumentParser(description='Fetch SRA metadata for a BioProject.')
319319
parser.add_argument('-b', '--bioproject_ids', type=str, required=True, help='BioProject ID to monitor. Multiple IDs should be separated by commas.')
320320
parser.add_argument('-e', '--email', type=str, required=True, help='Email address for Entrez')
321-
parser.add_argument('-m', '--metadata', type=str, required=True, help='Path to old metadata file')
321+
parser.add_argument('-m', '--metadata', type=str, required=False, help='Path to old metadata file')
322322
parser.add_argument('-t', '--trimming_config', type=str, required=False, help='Path to trimming yaml file')
323323
parser.add_argument('-r', '--check_retracted', action='store_true', help='Check for retracted SRA runs')
324324
return parser.parse_args()
@@ -347,13 +347,25 @@ def main():
347347
"""
348348

349349
args = parse_args()
350-
350+
351+
# Resolve metadata path: use provided path or build a sensible default from bioproject ids
352+
if args.metadata:
353+
metadata_path = args.metadata
354+
else:
355+
proj_tag = "_".join([bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]) or "sra"
356+
metadata_path = f"{proj_tag}_metadata.csv"
357+
351358
# Process BioProject IDs
352359
bioproject_ids = [bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]
353360
search_term = " OR ".join(f"{bid}[BioProject]" for bid in bioproject_ids)
354361

355362
new_metadata = get_new_srps(search_term, args.email)
356-
prev_metadata = pd.read_csv(args.metadata)
363+
# Try to read previous metadata from the resolved metadata_path; if missing use empty df
364+
try:
365+
prev_metadata = pd.read_csv(metadata_path)
366+
except (FileNotFoundError, pd.errors.EmptyDataError):
367+
print(f"Metadata file {metadata_path} not found or empty, using empty DataFrame.")
368+
prev_metadata = pd.DataFrame(columns=FIELDS.keys())
357369

358370
new_sras = new_metadata.loc[~new_metadata['Run'].isin(prev_metadata['Run'])]
359371
combined_metadata = prev_metadata.copy()
@@ -380,7 +392,7 @@ def main():
380392
save_columns.append('global_trimming')
381393

382394
new_sras[save_columns].to_csv(
383-
args.metadata.replace('.csv', '_to_process.tsv'),
395+
metadata_path.replace('.csv', '_to_process.tsv'),
384396
index=False,
385397
sep='\t'
386398
)
@@ -389,6 +401,8 @@ def main():
389401
# Check for retracted runs and update metadata accordingly
390402
retracted_runs = check_retracted_runs(prev_metadata, new_metadata)
391403
combined_metadata['is_retracted'] = combined_metadata['Run'].isin(retracted_runs)
404+
# ensure is_retracted is boolean
405+
combined_metadata['is_retracted'] = combined_metadata['is_retracted'].astype(bool)
392406

393407
# Ensure the retraction detection date column exists
394408
if 'retraction_detection_date_utc' not in combined_metadata.columns:
@@ -399,10 +413,14 @@ def main():
399413
mask = combined_metadata['is_retracted'] & combined_metadata['retraction_detection_date_utc'].isna()
400414
combined_metadata.loc[mask, 'retraction_detection_date_utc'] = now_str
401415

416+
# assert that all retracted runs have a detection date and no non-retracted runs have a date
417+
assert combined_metadata.loc[combined_metadata['is_retracted'], 'retraction_detection_date_utc'].notna().all()
418+
assert combined_metadata.loc[~combined_metadata['is_retracted'], 'retraction_detection_date_utc'].isna().all()
419+
402420
if combined_metadata.equals(prev_metadata):
403421
print("No new updates found.")
404422
else:
405-
updated_path = args.metadata.replace('.csv', '_updated.csv')
423+
updated_path = metadata_path.replace('.csv', '_updated.csv')
406424
combined_metadata.to_csv(updated_path, index=False)
407425
print(f"Updated metadata saved to {updated_path}")
408426

conf/test.config

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,22 @@
1111
*/
1212

1313
params {
14-
config_profile_name = 'Test profile'
14+
config_profile_name = 'Test profile'
1515
config_profile_description = 'Test dataset to check pipeline function'
1616

1717
// Limit resources so that this can run on GitHub Actions
18-
max_cpus = 2
18+
max_cpus = 2
1919
max_memory = '6.GB'
20-
max_time = '6.h'
20+
max_time = '6.h'
2121

2222
// Input data
23-
bioproject = null
24-
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
25-
email = "test@test.com"
26-
metadata = null
23+
bioproject = null
24+
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
25+
email = "test@test.com"
26+
metadata = null
2727
// Reference genome
28-
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
29-
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
30-
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
31-
outdir = "${projectDir}/testing/output"
28+
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
29+
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
30+
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
31+
outdir = "${projectDir}/testing/output"
3232
}

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ manifest {
165165
description = """A pipeline to assemble avian flu genomes"""
166166
mainScript = 'main.nf'
167167
nextflowVersion = '!>=24.10.0'
168-
version = '1.5.1'
168+
version = '1.5.2'
169169
doi = ''
170170
}
171171

workflows/flusra.nf

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ workflow FLUSRA {
2525
by: 1,
2626
failOnDuplicate: true,
2727
)
28-
.map { sra, meta, reads ->
28+
.map { _sra, meta, reads ->
2929
tuple(meta, reads)
3030
}
3131
.set { reads_ch }
@@ -71,14 +71,10 @@ workflow FLUSRA {
7171
.set { sample_reads_input }
7272

7373
if (!params.fetch_and_pull) {
74-
sample_reads_input.samples.filter { it
75-
!= null }
76-
| PROCESS_SRA
74+
sample_reads_input.samples.filter { it != null } | PROCESS_SRA
7775
ch_versions = ch_versions.mix(PROCESS_SRA.out.versions)
7876

79-
sample_reads_input.milk.filter { it
80-
!= null }
81-
| MILK_FREYJA
77+
sample_reads_input.milk.filter { it != null } | MILK_FREYJA
8278
ch_versions = ch_versions.mix(MILK_FREYJA.out.versions)
8379
}
8480

0 commit comments

Comments
 (0)