Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions bin/fetch_sra_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Fetch SRA metadata for a BioProject.')
parser.add_argument('-b', '--bioproject_ids', type=str, required=True, help='BioProject ID to monitor. Multiple IDs should be separated by commas.')
parser.add_argument('-e', '--email', type=str, required=True, help='Email address for Entrez')
parser.add_argument('-m', '--metadata', type=str, required=True, help='Path to old metadata file')
parser.add_argument('-m', '--metadata', type=str, required=False, help='Path to old metadata file')
parser.add_argument('-t', '--trimming_config', type=str, required=False, help='Path to trimming yaml file')
parser.add_argument('-r', '--check_retracted', action='store_true', help='Check for retracted SRA runs')
return parser.parse_args()
Expand Down Expand Up @@ -347,13 +347,25 @@ def main():
"""

args = parse_args()


# Resolve metadata path: use provided path or build a sensible default from bioproject ids
if args.metadata:
metadata_path = args.metadata
else:
proj_tag = "_".join([bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]) or "sra"
metadata_path = f"{proj_tag}_metadata.csv"

# Process BioProject IDs
bioproject_ids = [bid.strip() for bid in args.bioproject_ids.split(',') if bid.strip()]
search_term = " OR ".join(f"{bid}[BioProject]" for bid in bioproject_ids)

new_metadata = get_new_srps(search_term, args.email)
prev_metadata = pd.read_csv(args.metadata)
# Try to read previous metadata from the resolved metadata_path; if missing use empty df
try:
prev_metadata = pd.read_csv(metadata_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
print(f"Metadata file {metadata_path} not found or empty, using empty DataFrame.")
prev_metadata = pd.DataFrame(columns=FIELDS.keys())

new_sras = new_metadata.loc[~new_metadata['Run'].isin(prev_metadata['Run'])]
combined_metadata = prev_metadata.copy()
Expand All @@ -380,7 +392,7 @@ def main():
save_columns.append('global_trimming')

new_sras[save_columns].to_csv(
args.metadata.replace('.csv', '_to_process.tsv'),
metadata_path.replace('.csv', '_to_process.tsv'),
index=False,
sep='\t'
)
Expand All @@ -389,6 +401,8 @@ def main():
# Check for retracted runs and update metadata accordingly
retracted_runs = check_retracted_runs(prev_metadata, new_metadata)
combined_metadata['is_retracted'] = combined_metadata['Run'].isin(retracted_runs)
# ensure is_retracted is boolean
combined_metadata['is_retracted'] = combined_metadata['is_retracted'].astype(bool)

# Ensure the retraction detection date column exists
if 'retraction_detection_date_utc' not in combined_metadata.columns:
Expand All @@ -399,10 +413,14 @@ def main():
mask = combined_metadata['is_retracted'] & combined_metadata['retraction_detection_date_utc'].isna()
combined_metadata.loc[mask, 'retraction_detection_date_utc'] = now_str

# assert that all retracted runs have a detection date and no non-retracted runs have a date
assert combined_metadata.loc[combined_metadata['is_retracted'], 'retraction_detection_date_utc'].notna().all()
assert combined_metadata.loc[~combined_metadata['is_retracted'], 'retraction_detection_date_utc'].isna().all()

if combined_metadata.equals(prev_metadata):
print("No new updates found.")
else:
updated_path = args.metadata.replace('.csv', '_updated.csv')
updated_path = metadata_path.replace('.csv', '_updated.csv')
combined_metadata.to_csv(updated_path, index=False)
print(f"Updated metadata saved to {updated_path}")

Expand Down
22 changes: 11 additions & 11 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,22 @@
*/

params {
config_profile_name = 'Test profile'
config_profile_name = 'Test profile'
config_profile_description = 'Test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
max_time = '6.h'

// Input data
bioproject = null
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
email = "test@test.com"
metadata = null
bioproject = null
samples_to_process = "${projectDir}/assets/test/input/sra/input.tsv"
email = "test@test.com"
metadata = null
// Reference genome
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
outdir = "${projectDir}/testing/output"
reference = "${projectDir}/assets/test/input/ref/reference.fasta"
milk_reference = "${projectDir}/assets/test/input/ref/milk_reference.fasta"
milk_barcode = "${projectDir}/assets/test/input/barcode/barcode.csv"
outdir = "${projectDir}/testing/output"
}
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ manifest {
description = """A pipeline to assemble avian flu genomes"""
mainScript = 'main.nf'
nextflowVersion = '!>=24.10.0'
version = '1.5.1'
version = '1.5.2'
doi = ''
}

Expand Down
10 changes: 3 additions & 7 deletions workflows/flusra.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ workflow FLUSRA {
by: 1,
failOnDuplicate: true,
)
.map { sra, meta, reads ->
.map { _sra, meta, reads ->
tuple(meta, reads)
}
.set { reads_ch }
Expand Down Expand Up @@ -71,14 +71,10 @@ workflow FLUSRA {
.set { sample_reads_input }

if (!params.fetch_and_pull) {
sample_reads_input.samples.filter { it
!= null }
| PROCESS_SRA
sample_reads_input.samples.filter { it != null } | PROCESS_SRA
ch_versions = ch_versions.mix(PROCESS_SRA.out.versions)

sample_reads_input.milk.filter { it
!= null }
| MILK_FREYJA
sample_reads_input.milk.filter { it != null } | MILK_FREYJA
ch_versions = ch_versions.mix(MILK_FREYJA.out.versions)
}

Expand Down
Loading