diff --git a/bin/fetch_sra_metadata.py b/bin/fetch_sra_metadata.py index 5aeb639..1e3a7f1 100755 --- a/bin/fetch_sra_metadata.py +++ b/bin/fetch_sra_metadata.py @@ -318,7 +318,7 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description='Fetch SRA metadata for a BioProject.') parser.add_argument('-b', '--bioproject_ids', type=str, required=True, help='BioProject ID to monitor. Multiple IDs should be separated by commas.') parser.add_argument('-e', '--email', type=str, required=True, help='Email address for Entrez') - parser.add_argument('-m', '--metadata', type=str, required=True, help='Path to old metadata file') + parser.add_argument('-m', '--metadata', type=str, required=False, help='Path to old metadata file') parser.add_argument('-t', '--trimming_config', type=str, required=False, help='Path to trimming yaml file') parser.add_argument('-r', '--check_retracted', action='store_true', help='Check for retracted SRA runs') return parser.parse_args() @@ -347,13 +347,25 @@ def main(): """ args = parse_args() - + + # Resolve metadata path: use provided path or build a sensible default from bioproject ids + if args.metadata: + metadata_path = args.metadata + else: + proj_tag = "_".join([bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]) or "sra" + metadata_path = f"{proj_tag}_metadata.csv" + # Process BioProject IDs bioproject_ids = [bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()] search_term = " OR ".join(f"{bid}[BioProject]" for bid in bioproject_ids) new_metadata = get_new_srps(search_term, args.email) - prev_metadata = pd.read_csv(args.metadata) + # Try to read previous metadata from the resolved metadata_path; if missing use empty df + try: + prev_metadata = pd.read_csv(metadata_path) + except (FileNotFoundError, pd.errors.EmptyDataError): + print(f"Metadata file {metadata_path} not found or empty, using empty DataFrame.") + prev_metadata = pd.DataFrame(columns=FIELDS.keys()) new_sras = new_metadata.loc[~new_metadata['Run'].isin(prev_metadata['Run'])] combined_metadata = prev_metadata.copy() @@ -380,7 +392,7 @@ def main(): save_columns.append('global_trimming') new_sras[save_columns].to_csv( - args.metadata.replace('.csv', '_to_process.tsv'), + metadata_path.replace('.csv', '_to_process.tsv'), index=False, sep='\t' ) @@ -389,6 +401,8 @@ def main(): # Check for retracted runs and update metadata accordingly retracted_runs = check_retracted_runs(prev_metadata, new_metadata) combined_metadata['is_retracted'] = combined_metadata['Run'].isin(retracted_runs) + # ensure is_retracted is boolean + combined_metadata['is_retracted'] = combined_metadata['is_retracted'].astype(bool) # Ensure the retraction detection date column exists if 'retraction_detection_date_utc' not in combined_metadata.columns: @@ -399,10 +413,14 @@ def main(): mask = combined_metadata['is_retracted'] & combined_metadata['retraction_detection_date_utc'].isna() combined_metadata.loc[mask, 'retraction_detection_date_utc'] = now_str + # assert that all retracted runs have a detection date and no non-retracted runs have a date + assert combined_metadata.loc[combined_metadata['is_retracted'], 'retraction_detection_date_utc'].notna().all() + assert combined_metadata.loc[~combined_metadata['is_retracted'], 'retraction_detection_date_utc'].isna().all() + if combined_metadata.equals(prev_metadata): print("No new updates found.") else: - updated_path = args.metadata.replace('.csv', '_updated.csv') + updated_path = metadata_path.replace('.csv', '_updated.csv') combined_metadata.to_csv(updated_path, index=False) print(f"Updated metadata saved to {updated_path}") diff --git a/conf/test.config b/conf/test.config index 652d716..f673b5e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -11,22 +11,22 @@ */ params { - config_profile_name = 'Test profile' + config_profile_name = 'Test profile' config_profile_description = 'Test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 + max_cpus = 2 max_memory = '6.GB' - max_time = '6.h' + max_time = '6.h' // Input data - bioproject = null - samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv" - email = "test@test.com" - metadata = null + bioproject = null + samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv" + email = "test@test.com" + metadata = null // Reference genome - reference = "${projectDir}/assets/test/input/ref/reference.fasta" - milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta" - milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv" - outdir = "${projectDir}/testing/output" + reference = "${projectDir}/assets/test/input/ref/reference.fasta" + milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta" + milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv" + outdir = "${projectDir}/testing/output" } diff --git a/nextflow.config b/nextflow.config index 1a5a689..f5e7f6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -165,7 +165,7 @@ manifest { description = """A pipeline to assemble avian flu genomes""" mainScript = 'main.nf' nextflowVersion = '!>=24.10.0' - version = '1.5.1' + version = '1.5.2' doi = '' } diff --git a/workflows/flusra.nf b/workflows/flusra.nf index 91a9ae5..d2f3c6d 100644 --- a/workflows/flusra.nf +++ b/workflows/flusra.nf @@ -25,7 +25,7 @@ workflow FLUSRA { by: 1, failOnDuplicate: true, ) - .map { sra, meta, reads -> + .map { _sra, meta, reads -> tuple(meta, reads) } .set { reads_ch } @@ -71,14 +71,10 @@ workflow FLUSRA { .set { sample_reads_input } if (!params.fetch_and_pull) { - sample_reads_input.samples.filter { it - != null } - | PROCESS_SRA + sample_reads_input.samples.filter { it != null } | PROCESS_SRA ch_versions = ch_versions.mix(PROCESS_SRA.out.versions) - sample_reads_input.milk.filter { it - != null } - | MILK_FREYJA + sample_reads_input.milk.filter { it != null } | MILK_FREYJA ch_versions = ch_versions.mix(MILK_FREYJA.out.versions) }