From f71fa0f7381bbcfe67df1f5dae90c50eb3067e6e Mon Sep 17 00:00:00 2001 From: gp201 Date: Fri, 12 Sep 2025 13:36:05 -0700 Subject: [PATCH 1/4] fix: added assertion to ensure retraction_detection_date_utc is only present when is_retracted is True --- bin/fetch_sra_metadata.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/bin/fetch_sra_metadata.py b/bin/fetch_sra_metadata.py index 5aeb639..1e3a7f1 100755 --- a/bin/fetch_sra_metadata.py +++ b/bin/fetch_sra_metadata.py @@ -318,7 +318,7 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description='Fetch SRA metadata for a BioProject.') parser.add_argument('-b', '--bioproject_ids', type=str, required=True, help='BioProject ID to monitor. Multiple IDs should be separated by commas.') parser.add_argument('-e', '--email', type=str, required=True, help='Email address for Entrez') - parser.add_argument('-m', '--metadata', type=str, required=True, help='Path to old metadata file') + parser.add_argument('-m', '--metadata', type=str, required=False, help='Path to old metadata file') parser.add_argument('-t', '--trimming_config', type=str, required=False, help='Path to trimming yaml file') parser.add_argument('-r', '--check_retracted', action='store_true', help='Check for retracted SRA runs') return parser.parse_args() @@ -347,13 +347,25 @@ def main(): """ args = parse_args() - + + # Resolve metadata path: use provided path or build a sensible default from bioproject ids + if args.metadata: + metadata_path = args.metadata + else: + proj_tag = "_".join([bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]) or "sra" + metadata_path = f"{proj_tag}_metadata.csv" + # Process BioProject IDs bioproject_ids = [bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()] search_term = " OR ".join(f"{bid}[BioProject]" for bid in bioproject_ids) new_metadata = get_new_srps(search_term, args.email) - prev_metadata = pd.read_csv(args.metadata) + # Try to read previous metadata from the resolved metadata_path; if missing use empty df + try: + prev_metadata = pd.read_csv(metadata_path) + except (FileNotFoundError, pd.errors.EmptyDataError): + print(f"Metadata file {metadata_path} not found or empty, using empty DataFrame.") + prev_metadata = pd.DataFrame(columns=FIELDS.keys()) new_sras = new_metadata.loc[~new_metadata['Run'].isin(prev_metadata['Run'])] combined_metadata = prev_metadata.copy() @@ -380,7 +392,7 @@ def main(): save_columns.append('global_trimming') new_sras[save_columns].to_csv( - args.metadata.replace('.csv', '_to_process.tsv'), + metadata_path.replace('.csv', '_to_process.tsv'), index=False, sep='\t' ) @@ -389,6 +401,8 @@ def main(): # Check for retracted runs and update metadata accordingly retracted_runs = check_retracted_runs(prev_metadata, new_metadata) combined_metadata['is_retracted'] = combined_metadata['Run'].isin(retracted_runs) + # ensure is_retracted is boolean + combined_metadata['is_retracted'] = combined_metadata['is_retracted'].astype(bool) # Ensure the retraction detection date column exists if 'retraction_detection_date_utc' not in combined_metadata.columns: @@ -399,10 +413,14 @@ def main(): mask = combined_metadata['is_retracted'] & combined_metadata['retraction_detection_date_utc'].isna() combined_metadata.loc[mask, 'retraction_detection_date_utc'] = now_str + # assert that all retracted runs have a detection date and no non-retracted runs have a date + assert combined_metadata.loc[combined_metadata['is_retracted'], 'retraction_detection_date_utc'].notna().all() + assert combined_metadata.loc[~combined_metadata['is_retracted'], 'retraction_detection_date_utc'].isna().all() + if combined_metadata.equals(prev_metadata): print("No new updates found.") else: - updated_path = args.metadata.replace('.csv', '_updated.csv') + updated_path = metadata_path.replace('.csv', '_updated.csv') combined_metadata.to_csv(updated_path, index=False) print(f"Updated metadata saved to {updated_path}") From 0ea5340f2c8afab5ec98d571eec90adc0bb29e9a Mon Sep 17 00:00:00 2001 From: gp201 Date: Fri, 12 Sep 2025 13:36:23 -0700 Subject: [PATCH 2/4] lint: lint fixes --- conf/test.config | 22 +++++++++++----------- modules/nf-core/sratools/prefetch/main.nf | 2 +- workflows/flusra.nf | 10 +++------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/conf/test.config b/conf/test.config index 652d716..f673b5e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -11,22 +11,22 @@ */ params { - config_profile_name = 'Test profile' + config_profile_name = 'Test profile' config_profile_description = 'Test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 + max_cpus = 2 max_memory = '6.GB' - max_time = '6.h' + max_time = '6.h' // Input data - bioproject = null - samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv" - email = "test@test.com" - metadata = null + bioproject = null + samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv" + email = "test@test.com" + metadata = null // Reference genome - reference = "${projectDir}/assets/test/input/ref/reference.fasta" - milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta" - milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv" - outdir = "${projectDir}/testing/output" + reference = "${projectDir}/assets/test/input/ref/reference.fasta" + milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta" + milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv" + outdir = "${projectDir}/testing/output" } diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 29195e5..23e8182 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -14,7 +14,7 @@ process SRATOOLS_PREFETCH { when: task.ext.when == null || task.ext.when - shell: + script: args = '5 1 100' // template('retry_with_backoff.sh') diff --git a/workflows/flusra.nf b/workflows/flusra.nf index 91a9ae5..d2f3c6d 100644 --- a/workflows/flusra.nf +++ b/workflows/flusra.nf @@ -25,7 +25,7 @@ workflow FLUSRA { by: 1, failOnDuplicate: true, ) - .map { sra, meta, reads -> + .map { _sra, meta, reads -> tuple(meta, reads) } .set { reads_ch } @@ -71,14 +71,10 @@ workflow FLUSRA { .set { sample_reads_input } if (!params.fetch_and_pull) { - sample_reads_input.samples.filter { it - != null } - | PROCESS_SRA + sample_reads_input.samples.filter { it != null } | PROCESS_SRA ch_versions = ch_versions.mix(PROCESS_SRA.out.versions) - sample_reads_input.milk.filter { it - != null } - | MILK_FREYJA + sample_reads_input.milk.filter { it != null } | MILK_FREYJA ch_versions = ch_versions.mix(MILK_FREYJA.out.versions) } From 9d8c077890bd44b794329ba253ed702dfc655e86 Mon Sep 17 00:00:00 2001 From: gp201 Date: Fri, 12 Sep 2025 13:36:36 -0700 Subject: [PATCH 3/4] chore: bumped version to 1.5.2 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 1a5a689..f5e7f6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -165,7 +165,7 @@ manifest { description = """A pipeline to assemble avian flu genomes""" mainScript = 'main.nf' nextflowVersion = '!>=24.10.0' - version = '1.5.1' + version = '1.5.2' doi = '' } From 10f6cbd2193963209921d1b9bd2f902fe1afa97b Mon Sep 17 00:00:00 2001 From: gp201 Date: Fri, 12 Sep 2025 13:57:14 -0700 Subject: [PATCH 4/4] fix: change script block to shell in SRATOOLS_PREFETCH process --- modules/nf-core/sratools/prefetch/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 23e8182..29195e5 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -14,7 +14,7 @@ process SRATOOLS_PREFETCH { when: task.ext.when == null || task.ext.when - script: + shell: args = '5 1 100' // template('retry_with_backoff.sh')