Skip to content

Commit cae48d8

Browse files
authored
Fix: fetch sra metadata retraction module (#32)
This pull request fixes a bug where an SRA sample that is not retracted was being assigned a retraction date. It also includes minor updates to improve code clarity and maintain consistency across the pipeline. The main changes are a switch from `shell` to `script` for process execution, variable renaming for clarity, and minor formatting improvements. The version number has also been incremented.

Fix:
* Add assertions enforcing SRA retraction consistency (retracted => has date; not retracted => no date).

General improvements and maintenance:
* Changed process execution from `shell` to `script` in the `SRATOOLS_PREFETCH` process for better compatibility and maintainability (`modules/nf-core/sratools/prefetch/main.nf`).
* Updated the pipeline version from `1.5.1` to `1.5.2` in `nextflow.config`.

Code clarity and formatting:
* Renamed the unused lambda parameter from `sra` to `_sra` in the `FLUSRA` workflow to indicate it is not used (`workflows/flusra.nf`).
1 parent 3593351 commit cae48d8

4 files changed

Lines changed: 38 additions & 24 deletions

File tree

bin/fetch_sra_metadata.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ def parse_args() -> argparse.Namespace:
318318
parser = argparse.ArgumentParser(description='Fetch SRA metadata for a BioProject.')
319319
parser.add_argument('-b', '--bioproject_ids', type=str, required=True, help='BioProject ID to monitor. Multiple IDs should be separated by commas.')
320320
parser.add_argument('-e', '--email', type=str, required=True, help='Email address for Entrez')
321-
parser.add_argument('-m', '--metadata', type=str, required=True, help='Path to old metadata file')
321+
parser.add_argument('-m', '--metadata', type=str, required=False, help='Path to old metadata file')
322322
parser.add_argument('-t', '--trimming_config', type=str, required=False, help='Path to trimming yaml file')
323323
parser.add_argument('-r', '--check_retracted', action='store_true', help='Check for retracted SRA runs')
324324
return parser.parse_args()
@@ -347,13 +347,25 @@ def main():
347347
"""
348348

349349
args = parse_args()
350-
350+
351+
# Resolve metadata path: use provided path or build a sensible default from bioproject ids
352+
if args.metadata:
353+
metadata_path = args.metadata
354+
else:
355+
proj_tag = "_".join([bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]) or "sra"
356+
metadata_path = f"{proj_tag}_metadata.csv"
357+
351358
# Process BioProject IDs
352359
bioproject_ids = [bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]
353360
search_term = " OR ".join(f"{bid}[BioProject]" for bid in bioproject_ids)
354361

355362
new_metadata = get_new_srps(search_term, args.email)
356-
prev_metadata = pd.read_csv(args.metadata)
363+
# Try to read previous metadata from the resolved metadata_path; if missing use empty df
364+
try:
365+
prev_metadata = pd.read_csv(metadata_path)
366+
except (FileNotFoundError, pd.errors.EmptyDataError):
367+
print(f"Metadata file {metadata_path} not found or empty, using empty DataFrame.")
368+
prev_metadata = pd.DataFrame(columns=FIELDS.keys())
357369

358370
new_sras = new_metadata.loc[~new_metadata['Run'].isin(prev_metadata['Run'])]
359371
combined_metadata = prev_metadata.copy()
@@ -380,7 +392,7 @@ def main():
380392
save_columns.append('global_trimming')
381393

382394
new_sras[save_columns].to_csv(
383-
args.metadata.replace('.csv', '_to_process.tsv'),
395+
metadata_path.replace('.csv', '_to_process.tsv'),
384396
index=False,
385397
sep='\t'
386398
)
@@ -389,6 +401,8 @@ def main():
389401
# Check for retracted runs and update metadata accordingly
390402
retracted_runs = check_retracted_runs(prev_metadata, new_metadata)
391403
combined_metadata['is_retracted'] = combined_metadata['Run'].isin(retracted_runs)
404+
# ensure is_retracted is boolean
405+
combined_metadata['is_retracted'] = combined_metadata['is_retracted'].astype(bool)
392406

393407
# Ensure the retraction detection date column exists
394408
if 'retraction_detection_date_utc' not in combined_metadata.columns:
@@ -399,10 +413,14 @@ def main():
399413
mask = combined_metadata['is_retracted'] & combined_metadata['retraction_detection_date_utc'].isna()
400414
combined_metadata.loc[mask, 'retraction_detection_date_utc'] = now_str
401415

416+
# assert that all retracted runs have a detection date and no non-retracted runs have a date
417+
assert combined_metadata.loc[combined_metadata['is_retracted'], 'retraction_detection_date_utc'].notna().all()
418+
assert combined_metadata.loc[~combined_metadata['is_retracted'], 'retraction_detection_date_utc'].isna().all()
419+
402420
if combined_metadata.equals(prev_metadata):
403421
print("No new updates found.")
404422
else:
405-
updated_path = args.metadata.replace('.csv', '_updated.csv')
423+
updated_path = metadata_path.replace('.csv', '_updated.csv')
406424
combined_metadata.to_csv(updated_path, index=False)
407425
print(f"Updated metadata saved to {updated_path}")
408426

conf/test.config

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,22 @@
1111
*/
1212

1313
params {
14-
config_profile_name = 'Test profile'
14+
config_profile_name = 'Test profile'
1515
config_profile_description = 'Test dataset to check pipeline function'
1616

1717
// Limit resources so that this can run on GitHub Actions
18-
max_cpus = 2
18+
max_cpus = 2
1919
max_memory = '6.GB'
20-
max_time = '6.h'
20+
max_time = '6.h'
2121

2222
// Input data
23-
bioproject = null
24-
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
25-
email = "test@test.com"
26-
metadata = null
23+
bioproject = null
24+
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
25+
email = "test@test.com"
26+
metadata = null
2727
// Reference genome
28-
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
29-
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
30-
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
31-
outdir = "${projectDir}/testing/output"
28+
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
29+
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
30+
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
31+
outdir = "${projectDir}/testing/output"
3232
}

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ manifest {
165165
description = """A pipeline to assemble avian flu genomes"""
166166
mainScript = 'main.nf'
167167
nextflowVersion = '!>=24.10.0'
168-
version = '1.5.1'
168+
version = '1.5.2'
169169
doi = ''
170170
}
171171

workflows/flusra.nf

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ workflow FLUSRA {
2525
by: 1,
2626
failOnDuplicate: true,
2727
)
28-
.map { sra, meta, reads ->
28+
.map { _sra, meta, reads ->
2929
tuple(meta, reads)
3030
}
3131
.set { reads_ch }
@@ -71,14 +71,10 @@ workflow FLUSRA {
7171
.set { sample_reads_input }
7272

7373
if (!params.fetch_and_pull) {
74-
sample_reads_input.samples.filter { it
75-
!= null }
76-
| PROCESS_SRA
74+
sample_reads_input.samples.filter { it != null } | PROCESS_SRA
7775
ch_versions = ch_versions.mix(PROCESS_SRA.out.versions)
7876

79-
sample_reads_input.milk.filter { it
80-
!= null }
81-
| MILK_FREYJA
77+
sample_reads_input.milk.filter { it != null } | MILK_FREYJA
8278
ch_versions = ch_versions.mix(MILK_FREYJA.out.versions)
8379
}
8480

0 commit comments

Comments
 (0)