diff --git a/.gitignore b/.gitignore index 6ca4a88..8551b05 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ envs/amrplusplus-update.sif logs/ .nextflow* nextflow-24.10.9-dist +trace-*.txt +wb/config/wb.env +wb/config/gcp.env diff --git a/bin/kraken2_long_to_wide_update.py b/bin/kraken2_long_to_wide_update.py index 23a15e7..afcee85 100755 --- a/bin/kraken2_long_to_wide_update.py +++ b/bin/kraken2_long_to_wide_update.py @@ -13,28 +13,30 @@ 'R1':0, 'R2':1, 'R3':2, - 'K': 3, - 'P': 4, - 'C': 5, - 'O': 6, - 'F': 7, - 'G': 8, - 'S': 9, - 'U': 10 + 'D': 3, + 'K': 4, + 'P': 5, + 'C': 6, + 'O': 7, + 'F': 8, + 'G': 9, + 'S': 10, + 'U': 11 } taxa_level_names = { 0: 'Root1', 1: 'Root2', 2: 'Root3', - 3: 'Kingdom', - 4: 'Phylum', - 5: 'Class', - 6: 'Order', - 7: 'Family', - 8: 'Genus', - 9: 'Species', - 10: 'Unclassified' + 3: 'Domain', + 4: 'Kingdom', + 5: 'Phylum', + 6: 'Class', + 7: 'Order', + 8: 'Family', + 9: 'Genus', + 10: 'Species', + 11: 'Unclassified' } diff --git a/config/google_batch.config b/config/google_batch.config index e51154e..335876b 100644 --- a/config/google_batch.config +++ b/config/google_batch.config @@ -10,30 +10,58 @@ */ process { - // Default for all processes cpus = 4 memory = '16 GB' machineType = 'n2-standard-4' cache = 'lenient' - executor = 'google-batch' withName: 'runqc' { cpus = 16 - memory = "64.GB" - machineType = "n2-standard-16" + memory = '64 GB' + machineType = 'n2-standard-16' containerOptions = '--env _JAVA_OPTIONS="-Xmx60g"' } - withName: 'bwa_align' { + withName: 'bowtie2_index' { + cpus = 8 + memory = '64 GB' + machineType = 'n2-highmem-8' + } + + withName: 'index' { + cpus = 8 + memory = '64 GB' + machineType = 'n2-highmem-8' + } + + withName: 'bowtie2_align' { + cpus = 32 + memory = '128 GB' + machineType = 'n2-standard-32' + } + + withName: 'bowtie2_rm_contaminant_fq' { cpus = 32 - memory = "256.GB" - machineType = "n2-highmem-32" + memory = '256 GB' + machineType = 'n2-highmem-32' + } + + withName: 'bwa_align' { + cpus = 16 + memory = '128 GB' + machineType = 'n2-highmem-32' } withName: 'runkraken' { cpus = 16 - memory = "256.GB" - machineType = "n2-highmem-32" + memory = '256 GB' + machineType = 'n2-highmem-32' + } + + withName: 'runkrakenInterleaved' { + cpus = 16 + memory = '256 GB' + machineType = 'n2-highmem-32' } } diff --git a/docs/workbench.md b/docs/workbench.md new file mode 100644 index 0000000..59fe718 --- /dev/null +++ b/docs/workbench.md @@ -0,0 +1,140 @@ +# FloRes on Verily Workbench + +**Prerequisites**: +- You must create a Workbench workspace where you have **ADMIN** permissions +- All setup and execution must be done within this workspace + +## Dependencies + +- **Verily Workbench CLI** (`wb`) - Workbench command-line tool +- **Google Cloud SDK** (`gcloud`) - GCP command-line tool +- **Docker** - For building and pushing container images (must be running) +- **Nextflow v24** - Workflow orchestration (installed in Workbench app) + - **Note**: v25 has breaking changes and is not compatible with this pipeline + +## Quick Start: Workbench Orchestration with Google Batch + +This guide walks through setting up and running FloRes with Workbench orchestration and Google Batch compute. The setup is split between local commands (for infrastructure) and Workbench app commands (for execution). + +### Step 1: Create Workspace and App + +Create a new workspace and app in the Workbench UI (or use the CLI if preferred). + +### Step 2: Local Setup + +Run these commands on your **local machine**: + +```bash +# Set your active workspace (replace with your workspace ID) +wb workspace set --id=your-workspace-id + +# Copy the Workbench environment template +cp wb/config/wb.env.template wb/config/wb.env +``` + +Edit `wb/config/wb.env` and set the user-defined variables: +- `GCS_BUCKET`: Your Workbench GCS bucket resource ID (e.g., `nf-output`) +- `GCS_REF_BUCKET`: Bucket containing reference genomes (e.g., `referencegenomes-wb-my-workspace-1234`) +- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) +- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repo (e.g., `nextflow-containers`) + +**Note**: Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations. + +Then run: + +```bash +# Set up infrastructure (creates buckets, service accounts, etc.) +./wb/setup_infra.sh wb + +# Upload input data and reference databases to GCS +./wb/upload_data.sh wb + +# Build Docker image and push to Artifact Registry +# NOTE: Docker must be running before executing this command +./wb/build.sh --env wb --push +``` + +### Step 3: Workbench App Setup + +Open your Workbench app, launch the Terminal, and run: + +```bash +# Clone the repository +cd repos/ && git clone https://github.com/passdan/FloRes.git && cd FloRes/ + +# Copy the environment template +cp wb/config/wb.env.template wb/config/wb.env +``` + +Now copy your local `wb/config/wb.env` configuration into the Workbench app. + +### Step 4: Run the Pipeline + +```bash +./wb/run.sh --env wb +``` + +Results will be stored in your configured GCS bucket. + +**Known Issues**: +- The `gcloud storage cp` command may not correctly resolve Workbench resource names to full `gs://` paths when running `upload_data.sh` or `run.sh`. If you encounter path resolution issues, manually specify the full GCS bucket path in your `wb.env` configuration. + +--- + +## Alternative: Quick Demo in Workbench JupyterLab + +For a simple demonstration without Google Batch (both orchestration and execution running in the same Workbench app): + +Create a new Workbench workspace and add this git repository in the **Apps** tab. + +Create a JupyterLab app instance, launch it, and open the terminal: + +```bash +# Initialize conda +conda init +source ~/.bashrc + +# Navigate to the repository +cd repos/FloRes + +# Create and activate the conda environment +conda env create -f envs/AMR++_env.yaml +conda activate AMR++_env + +# Verify Nextflow version 24 is installed +nextflow -v + +# Run the test pipeline (takes ~5 minutes) +nextflow run main_AMR++.nf +``` + +Expected output: results in `~/repos/FloRes/test_results` + +--- + +## Configuration + +### Resource Scaling on Google Batch + +Google Batch does NOT automatically scale machine types based on CPU/memory requests. Resource scaling is configured in `config/google_batch.config`. + +Each process that needs more than default resources must explicitly specify a matching `machineType`. Current resource allocations: + +| Process | CPUs | Memory | Machine Type | +|---------|------|--------|-------------| +| Default | 4 | 16 GB | n2-standard-4 | +| runqc | 16 | 64 GB | n2-standard-16 | +| bowtie2_align | 32 | 128 GB | n2-standard-32 | +| bowtie2_rm_contaminant_fq | 32 | 256 GB | n2-highmem-32 | +| bwa_align | 16 | 128 GB | n2-highmem-32 | +| runkraken | 16 | 256 GB | n2-highmem-32 | +| runkrakenInterleaved | 16 | 256 GB | n2-highmem-32 | + +### Supporting Environments + +**Local** (testing): `./wb/run.sh --env local` +- Requires Docker and Conda + +**GCP** (debugging): `./wb/run.sh --env gcp` +- For debugging Google Batch jobs with visible logs +- Requires `gcloud` CLI and Docker diff --git a/envs/containers/Dockerfile b/envs/containers/Dockerfile index 12b2a90..ba8d795 100755 --- a/envs/containers/Dockerfile +++ b/envs/containers/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update -q && \ subversion \ wget \ g++ \ + make \ libarchive13 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -59,5 +60,19 @@ RUN set -x && \ find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ /opt/conda/bin/conda clean -afy && \ /opt/conda/bin/conda install -c conda-forge mamba && \ - /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow && \ + /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow=24 fastp bowtie2 && \ conda clean --all + +# Copy AMR++ bin scripts into the container +COPY bin /opt/amrplusplus/bin +RUN chmod +x /opt/amrplusplus/bin/*.py && \ + chmod +x /opt/amrplusplus/bin/rarefaction && \ + chmod +x /opt/amrplusplus/bin/resistome + +# Clone AmrPlusPlus_SNP for SNP verification (replaces empty dir copied from bin/) +RUN rm -rf /opt/amrplusplus/bin/AmrPlusPlus_SNP && \ + git clone https://github.com/Isabella136/AmrPlusPlus_SNP.git /opt/amrplusplus/bin/AmrPlusPlus_SNP && \ + chmod -R 755 /opt/amrplusplus/bin/AmrPlusPlus_SNP + +# Add bin directory to PATH +ENV PATH="/opt/amrplusplus/bin:${PATH}" diff --git a/modules/Alignment/bowtie2-for_AMRplusplus.nf b/modules/Alignment/bowtie2-for_AMRplusplus.nf index f7b8dfc..f026f9a 100755 --- a/modules/Alignment/bowtie2-for_AMRplusplus.nf +++ b/modules/Alignment/bowtie2-for_AMRplusplus.nf @@ -145,6 +145,6 @@ process HostRemovalStats { path("host.removal.stats"), emit: combo_host_rm_stats """ - ${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats + ${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats """ } diff --git a/modules/Alignment/bwa.nf b/modules/Alignment/bwa.nf index 1617558..cf0a59f 100755 --- a/modules/Alignment/bwa.nf +++ b/modules/Alignment/bwa.nf @@ -63,22 +63,22 @@ process bwa_align { script: if( deduped == "N") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' | \ - ${SAMTOOLS} sort -@ ${threads} -m 4G -o ${pair_id}_alignment_sorted.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' | \ + ${SAMTOOLS} sort -@ ${task.cpus} -m 4G -o ${pair_id}_alignment_sorted.bam """ else if( deduped == "Y") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam - ${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam + ${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam rm ${pair_id}_alignment.sam - ${SAMTOOLS} sort -@ ${threads} -m 3G -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam + ${SAMTOOLS} sort -@ ${task.cpus} -m 3G -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam rm ${pair_id}_alignment.bam - ${SAMTOOLS} fixmate -@ ${threads} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam - ${SAMTOOLS} sort -@ ${threads} -m 3G ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam + ${SAMTOOLS} fixmate -@ ${task.cpus} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam + ${SAMTOOLS} sort -@ ${task.cpus} -m 3G ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam rm ${pair_id}_alignment_sorted_fix.bam ${SAMTOOLS} rmdup -S ${pair_id}_alignment_sorted_fix.sorted.bam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_sorted_fix.sorted.bam - ${SAMTOOLS} view -@ ${threads} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam + ${SAMTOOLS} view -@ ${task.cpus} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_dedup.sam """ else @@ -107,13 +107,13 @@ process bwa_rm_contaminant_fq { path("${pair_id}.samtools.idxstats"), emit: host_rm_stats """ - ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} | \ - ${SAMTOOLS} sort -@ ${threads} -m 4G -o ${pair_id}.host.sorted.bam + ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${task.cpus} | \ + ${SAMTOOLS} sort -@ ${task.cpus} -m 4G -o ${pair_id}.host.sorted.bam ${SAMTOOLS} index ${pair_id}.host.sorted.bam && ${SAMTOOLS} idxstats ${pair_id}.host.sorted.bam > ${pair_id}.samtools.idxstats ${SAMTOOLS} view -h -f 12 -b ${pair_id}.host.sorted.bam -o ${pair_id}.host.sorted.removed.bam - ${SAMTOOLS} sort -n -@ ${threads} -m 3G ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam + ${SAMTOOLS} sort -n -@ ${task.cpus} -m 3G ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam ${SAMTOOLS} \ - fastq -@ ${threads} -c 6 \ + fastq -@ ${task.cpus} -c 6 \ ${pair_id}.host.resorted.removed.bam \ -1 ${pair_id}.non.host.R1.fastq.gz \ -2 ${pair_id}.non.host.R2.fastq.gz \ @@ -143,6 +143,6 @@ process HostRemovalStats { path("host.removal.stats"), emit: combo_host_rm_stats """ - ${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats + ${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats """ } diff --git a/modules/Fastqc/fastqc.nf b/modules/Fastqc/fastqc.nf index bf3c8d0..893d592 100755 --- a/modules/Fastqc/fastqc.nf +++ b/modules/Fastqc/fastqc.nf @@ -51,7 +51,8 @@ process multiqc { script: """ cp $config/* . - multiqc -v data* --interactive -f --cl-config "max_table_rows: 3000" - mv multiqc_data/multiqc_general_stats.txt . + multiqc -v data* --interactive -f --cl-config "max_table_rows: 3000" --outdir multiqc_data --filename multiqc_report.html + mv multiqc_data/multiqc_report_data/multiqc_general_stats.txt . + mv multiqc_data/multiqc_report.html . """ } diff --git a/modules/Microbiome/kraken2.nf b/modules/Microbiome/kraken2.nf index 9219438..e7c9565 100755 --- a/modules/Microbiome/kraken2.nf +++ b/modules/Microbiome/kraken2.nf @@ -78,7 +78,7 @@ process krakenresults { path("kraken_analytic_matrix.csv") """ - ${PYTHON3} $baseDir/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.csv """ } diff --git a/modules/Microbiome/kraken_and_bracken.nf b/modules/Microbiome/kraken_and_bracken.nf index f7f60a6..bd511e2 100755 --- a/modules/Microbiome/kraken_and_bracken.nf +++ b/modules/Microbiome/kraken_and_bracken.nf @@ -104,13 +104,14 @@ process krakenresults { """ - ${PYTHON3} $baseDir/bin/kraken2_long_to_wide_update.py -i ${kraken_reports} -o kraken_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide_update.py -i ${kraken_reports} -o kraken_analytic_matrix.csv """ } process runbracken { label "microbiome" - + errorStrategy { task.exitStatus == 1 ? 'ignore' : 'terminate' } + input: tuple val(sample_id), path(kraken_report), val(level) path(krakendb) diff --git a/modules/Resistome/resistome.nf b/modules/Resistome/resistome.nf index 3b987df..420f5ea 100755 --- a/modules/Resistome/resistome.nf +++ b/modules/Resistome/resistome.nf @@ -40,7 +40,7 @@ process build_dependencies { #mv rarefaction ../ #cd ../ #rm -rf rarefactionanalyzer - cp $baseDir/bin/rarefaction . + cp /opt/amrplusplus/bin/rarefaction . #git clone https://github.com/cdeanj/resistomeanalyzer.git @@ -50,7 +50,7 @@ process build_dependencies { #mv resistome ../ #cd ../ #rm -rf resistomeanalyzer - cp $baseDir/bin/resistome . + cp /opt/amrplusplus/bin/resistome . git clone https://github.com/Isabella136/AmrPlusPlus_SNP.git chmod -R 777 AmrPlusPlus_SNP/ @@ -119,7 +119,7 @@ process resistomeresults { path("${prefix}_analytic_matrix.csv"), emit: snp_count_matrix, optional: true """ - ${PYTHON3} $baseDir/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv """ } @@ -189,7 +189,7 @@ process plotrarefaction { """ mkdir data/ mv *.tsv data/ - python $baseDir/bin/rfplot.py --dir ./data --nd --s --sd . + python /opt/amrplusplus/bin/rfplot.py --dir ./data --nd --s --sd . """ } @@ -219,7 +219,7 @@ process runsnp { path("${sample_id}.${prefix}_SNPs${sample_id}/*") """ - cp -r $baseDir/bin/AmrPlusPlus_SNP/* . + cp -r /opt/amrplusplus/bin/AmrPlusPlus_SNP/* . # change name to stay consistent with count matrix name, but only if the names don't match if [ "${bam}" != "${sample_id}.bam" ]; then @@ -257,7 +257,7 @@ process snpresults { """ - ${PYTHON3} $baseDir/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv """ } diff --git a/modules/Trimming/trimmomatic.nf b/modules/Trimming/trimmomatic.nf index db61c18..a26869f 100755 --- a/modules/Trimming/trimmomatic.nf +++ b/modules/Trimming/trimmomatic.nf @@ -41,7 +41,7 @@ process runqc { """ ${TRIMMOMATIC} \ PE \ - -threads ${threads} \ + -threads ${task.cpus} \ ${reads[0]} ${reads[1]} ${sample_id}.1P.fastq.gz ${sample_id}.1U.fastq.gz ${sample_id}.2P.fastq.gz ${sample_id}.2U.fastq.gz \ ILLUMINACLIP:${adapters}:2:30:10:3:TRUE \ LEADING:${leading} \ @@ -73,6 +73,6 @@ process QCstats { path("trimmomatic.stats"), emit: combo_trim_stats """ - ${PYTHON3} $baseDir/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats + ${PYTHON3} /opt/amrplusplus/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats """ } diff --git a/nextflow.config b/nextflow.config index bb06a4d..6d66c20 100755 --- a/nextflow.config +++ b/nextflow.config @@ -75,12 +75,12 @@ profiles { 'google-batch' { includeConfig "config/google_batch.config" process.executor = 'google-batch' - process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/flores-workbench:latest" process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } process.maxRetries = 2 workDir = "gs://${GCS_BUCKET}/scratch" - refDir = "gs://referencegenomes-wb-mighty-tangerine-1678" + refDir = "gs://${GCS_REF_BUCKET}" google.region = "${GCS_BUCKET_LOCATION}" google.project = "$GOOGLE_CLOUD_PROJECT" @@ -99,12 +99,12 @@ profiles { workbench { includeConfig "config/google_batch.config" process.executor = 'google-batch' - process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/flores-workbench:latest" process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } process.maxRetries = 5 workDir = "gs://${GCS_BUCKET}/scratch" - params.refDir = "gs://referencegenomes-wb-mighty-tangerine-1678" + params.refDir = "gs://${GCS_REF_BUCKET}" google.region = "${GCS_BUCKET_LOCATION}" google.project = "$GOOGLE_CLOUD_PROJECT" diff --git a/params.config b/params.config index 9c8aa37..ad563bf 100755 --- a/params.config +++ b/params.config @@ -14,10 +14,10 @@ params { // ----------------------------------------------------------------- /* Location of forward and reverse read pairs */ - reads = "gs://{GCS_BUCKET}/rawdata-wb-farms/S358_MiSeq_BHWNTNDRX5/fastq/C1*{1,2}.f*q.gz" + reads = "${baseDir}/data/raw/*_R{1,2}.fastq.gz" /* Output directory */ - output = "gs://${GCS_BUCKET}/results" + output = "test_results" // ----------------------------------------------------------------- // Reference Databases @@ -28,13 +28,13 @@ params { split = "" /* Location of reference/host genome */ - host = "gs://${GCS_REF_BUCKET}/grch38_1kgmaj.fa" + host = "${baseDir}/data/host/chr21.fasta.gz" /* Optionally, you can specify the location of the host index files created with bwa with the path and wildcard (*): */ - host_index = null - - /* Kraken database location, default is "null" and will download minikraken db */ - kraken_db = "gs://${GCS_REF_BUCKET}/minikraken_8GB_20200312" + host_index = "${baseDir}/data/host/chr21.fasta.gz*" + + /* Kraken database location, default is "null" and will download minikraken db */ + kraken_db = null /* Location of amr index files with wildcard */ /* If you want the bowtie indexes built, use the bareword "null" */ @@ -105,55 +105,6 @@ params { dada2_db = "$baseDir/data/qiime/gg-13-8-99-515-806-nb-classifier.qza" } - // ----------------------------------------------------------------- - // default step resource requirements - // ----------------------------------------------------------------- - - -process { - // Stage resource usages - // See config for singularity install details - cache = 'lenient' - executor = 'google-batch' - - withName: 'runqc' { - cpus = 16 - memory = "64.GB" - machineType = "n2-standard-16" - containerOptions = '--env _JAVA_OPTIONS="-Xmx60g"' - } - - withName: 'bowtie2_align' { - cpus = 32 - memory = "128.GB" - } - - withName: 'bowtie2_rm_contaminant_fq' { - cpus = 32 - memory = "256.GB" - machineType = "n2-highmem-32" - } - - withName: 'bwa_align' { - cpus = 16 - memory = "128.GB" - machineType = "n2-highmem-32" - } - - withName: 'runkraken' { - cpus = 16 - memory = "256.GB" - machineType = "n2-highmem-32" - } - - withName: 'runkrakenInterleaved' { - cpus = 16 - memory = "256.GB" - machineType = "n2-highmem-32" - } -} - - // The location of each dependency binary needs to be specified here. // The examples listed below are assuming the tools are already in the $PATH, however, // the absolute path to each tool can be entered individually. diff --git a/params_google_batch.config b/params_google_batch.config new file mode 100644 index 0000000..da16807 --- /dev/null +++ b/params_google_batch.config @@ -0,0 +1,92 @@ +/* + * Google Batch Parameter Configuration + * All file paths use gs:// for cloud execution via Google Batch. + * Uses GCS_BUCKET and GCS_REF_BUCKET environment variables. + */ + +def gcs_bucket = System.getenv("GCS_BUCKET") ?: "nf-files" +def gcs_ref_bucket = System.getenv("GCS_REF_BUCKET") ?: gcs_bucket + +params { + help = false + + // ----------------------------------------------------------------- + // Input Data + // ----------------------------------------------------------------- + reads = "gs://${gcs_bucket}/data/raw/*_R{1,2}.fastq.gz" + bam_files = null + split = "" + + // ----------------------------------------------------------------- + // Reference Databases + // ----------------------------------------------------------------- + host = "gs://${gcs_ref_bucket}/grch38_1kgmaj.fa" + host_index = "gs://${gcs_ref_bucket}/grch38_1kgmaj{.fa,*.bt2}" + + kraken_db = "gs://${gcs_ref_bucket}/minikraken_8GB_20200312" + + amr_index = null + amr = "gs://${gcs_bucket}/data/amr/megares_database_v3.fasta" + annotation = "gs://${gcs_bucket}/data/amr/megares_annotations_v3.00.csv" + + // ----------------------------------------------------------------- + // Output + // ----------------------------------------------------------------- + output = "gs://${gcs_bucket}/results" + + // ----------------------------------------------------------------- + // Pipeline Logic & Analysis Toggles + // ----------------------------------------------------------------- + snp = "Y" + deduped = "N" + prefix = "AMR" + threads = 8 + + // ----------------------------------------------------------------- + // Trimming Parameters + // ----------------------------------------------------------------- + adapters = "gs://${gcs_bucket}/data/adapters/nextera.fa" + leading = 3 + trailing = 3 + slidingwindow = "4:15" + minlen = 36 + + // ----------------------------------------------------------------- + // Resistome Analysis Parameters + // ----------------------------------------------------------------- + threshold = 80 + min = 5 + max = 100 + skip = 5 + samples = 1 + + // ----------------------------------------------------------------- + // Other Tools + // ----------------------------------------------------------------- + multiqc = "gs://${gcs_bucket}/data/multiqc" + + p_trim_left_f = 25 + p_trim_left_r = 26 + p_trunc_len_f = 225 + p_trunc_len_r = 220 + + taxlevel = "R1,R2,R3,K,P,C,O,F,G,S" + + dada2_db = "gs://${gcs_bucket}/data/qiime/gg-13-8-99-515-806-nb-classifier.qza" +} + +env { + JAVA = "java" + TRIMMOMATIC = "trimmomatic" + FASTP = "fastp" + PYTHON3 = "python3" + BWA = "bwa" + BOWTIE2 = "bowtie2" + SAMTOOLS = "samtools" + BEDTOOLS = "bedtools" + RESISTOME = "resistome" + RAREFACTION = "rarefaction" + SNPFINDER = "snpfinder" + KRAKEN2 = "kraken2" + QIIME = "qiime" +} diff --git a/subworkflows/fastq_host_removal.nf b/subworkflows/fastq_host_removal.nf index c93ecc0..cfaacd6 100755 --- a/subworkflows/fastq_host_removal.nf +++ b/subworkflows/fastq_host_removal.nf @@ -19,9 +19,7 @@ workflow FASTQ_RM_HOST_WF { reference_index_files = bowtie2_index.out } else { reference_index_files = Channel - .fromPath(Paths.get(params.host_index)) - .map { file(it.toString()) } - .filter { file(it).exists() } + .fromPath(params.host_index) .toList() .map { files -> if (files.size() < 6) { diff --git a/wb/config/gcp.env b/wb/config/gcp.env deleted file mode 100644 index 75756fc..0000000 --- a/wb/config/gcp.env +++ /dev/null @@ -1,41 +0,0 @@ -# GCP environment configuration -# This is used when running Nextflow on Google Batch with local orchestration -# Nextflow runs on your local machine, jobs execute on Google Batch - -############################################################################### -# USER CONFIGURATION - UPDATE THESE VALUES -############################################################################### - -# GCS bucket for storing pipeline data and results -# Replace with your GCS bucket name (without gs:// prefix) -# Example: "my-nextflow-data" -export GCS_BUCKET=mod-wb-mighty-tangerine-1678 - -# GCS bucket location/region -# Common values: us-central1, us-east1, europe-west1 -export GCS_BUCKET_LOCATION=europe-west2 - -# Google Artifact Registry repository name -# Replace with your artifact registry repository name -# Example: "nextflow-containers" -export GOOGLE_ARTIFACT_REPO=nextflow-containers - -############################################################################### -# AUTOMATIC CONFIGURATION - DO NOT MODIFY -############################################################################### - -# Google Cloud project (auto-detected from gcloud CLI) -export GOOGLE_CLOUD_PROJECT=$(gcloud config get project) - -# Service account configuration (auto-generated) -export GOOGLE_SERVICE_ACCOUNT_NAME=nextflow-runner -export GOOGLE_SERVICE_ACCOUNT_EMAIL="${GOOGLE_SERVICE_ACCOUNT_NAME}@${GOOGLE_CLOUD_PROJECT}.iam.gserviceaccount.com" - -# Docker image configuration (auto-generated paths) -IMAGE_NAME="amrplusplus-workbench" -IMAGE_TAG="latest" -REGISTRY_PATH="us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" - -# Nextflow profile and config -NEXTFLOW_PROFILE="google_batch" -NEXTFLOW_CONFIG="params_google-batch.config" diff --git a/wb/config/gcp.env.template b/wb/config/gcp.env.template index 79032ee..609e78c 100644 --- a/wb/config/gcp.env.template +++ b/wb/config/gcp.env.template @@ -32,7 +32,7 @@ export GOOGLE_SERVICE_ACCOUNT_NAME=nextflow-runner export GOOGLE_SERVICE_ACCOUNT_EMAIL="${GOOGLE_SERVICE_ACCOUNT_NAME}@${GOOGLE_CLOUD_PROJECT}.iam.gserviceaccount.com" # Docker image configuration (auto-generated paths) -IMAGE_NAME="amrplusplus-workbench" +IMAGE_NAME="flores-workbench" IMAGE_TAG="latest" REGISTRY_PATH="us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" diff --git a/wb/config/local.env b/wb/config/local.env index ecd7915..a1df9f3 100644 --- a/wb/config/local.env +++ b/wb/config/local.env @@ -7,8 +7,8 @@ # Docker image configuration # Replace with your Docker Hub username -# Example: "johndoe/amrplusplus-workbench" -IMAGE_NAME="passdan/amrplusplus-workbench" +# Example: "johndoe/flores-workbench" +IMAGE_NAME="passdan/flores-workbench" ############################################################################### # AUTOMATIC CONFIGURATION - DO NOT MODIFY diff --git a/wb/config/local.env.template b/wb/config/local.env.template index cd78e5b..cba30b5 100644 --- a/wb/config/local.env.template +++ b/wb/config/local.env.template @@ -7,8 +7,8 @@ # Docker image configuration # Replace with your Docker Hub username -# Example: "johndoe/amrplusplus-workbench" -IMAGE_NAME="/amrplusplus-workbench" +# Example: "johndoe/flores-workbench" +IMAGE_NAME="/flores-workbench" ############################################################################### # AUTOMATIC CONFIGURATION - DO NOT MODIFY diff --git a/wb/config/wb.env b/wb/config/wb.env deleted file mode 100644 index c4e4ea6..0000000 --- a/wb/config/wb.env +++ /dev/null @@ -1,44 +0,0 @@ -# Workbench environment configuration -# This is used when running Nextflow on Google Batch with Workbench orchestration -# Both Nextflow orchestration and job execution happen in Workbench/Google Cloud - -############################################################################### -# USER CONFIGURATION - UPDATE THESE VALUES -############################################################################### - -# GCS bucket resource ID (created via Workbench) -# Replace with your Workbench GCS bucket resource ID -# Example: "nf-output" or "my-pipeline-data" -# Note: Use the resource ID, not the full GCS bucket name -export GCS_BUCKET=wb-mighty-tangerine-1678 -export GCS_REF_BUCKET=referencegenomes-wb-mighty-tangerine-1678 - -# GCS bucket location/region -# Common values: us-central1, us-east1, europe-west1 -export GCS_BUCKET_LOCATION=europe-west2 - -# Google Artifact Registry repository name -# Replace with your artifact registry repository name -# Example: "nextflow-containers" -export GOOGLE_ARTIFACT_REPO=nextflow-containers - -############################################################################### -# AUTOMATIC CONFIGURATION - DO NOT MODIFY -############################################################################### - -# Google Cloud project (auto-detected from Workbench workspace) -export WORKBENCH_GOOGLE_CLOUD_PROJECT=$(wb status 2>/dev/null | grep "Google project" | awk -F': ' '{print $2}' | xargs) -export GOOGLE_CLOUD_PROJECT="${WORKBENCH_GOOGLE_CLOUD_PROJECT}" - -# Service account configuration (Workbench Pet Service Account - auto-detected) -export GOOGLE_SERVICE_ACCOUNT_EMAIL=$(wb auth status 2>&1 | grep "Service account email" | awk -F': ' '{print $2}' | xargs) -export GOOGLE_SERVICE_ACCOUNT_NAME=$(echo "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" | cut -d'@' -f1) - -# Docker image configuration (auto-generated paths) -IMAGE_NAME="amrplusplus-workbench" -IMAGE_TAG="latest" -REGISTRY_PATH="us-central1-docker.pkg.dev/${WORKBENCH_GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" - -# Nextflow profile and config -NEXTFLOW_PROFILE="workbench" -NEXTFLOW_CONFIG="params.config" diff --git a/wb/config/wb.env.template b/wb/config/wb.env.template index 3503d55..ab9011a 100644 --- a/wb/config/wb.env.template +++ b/wb/config/wb.env.template @@ -1,6 +1,12 @@ # Workbench environment configuration # This is used when running Nextflow on Google Batch with Workbench orchestration # Both Nextflow orchestration and job execution happen in Workbench/Google Cloud +# +# Setup: +# cp wb/config/wb.env.template wb/config/wb.env +# # Edit wb/config/wb.env and fill in the values below +# +# Note: wb.env is gitignored — each user/workspace needs its own copy. ############################################################################### # USER CONFIGURATION - UPDATE THESE VALUES @@ -12,6 +18,11 @@ # Note: Use the resource ID, not the full GCS bucket name export GCS_BUCKET= +# GCS reference data bucket +# Replace with the bucket containing reference genomes +# Example: "referencegenomes-wb-my-workspace-1234" +export GCS_REF_BUCKET= + # GCS bucket location/region # Common values: us-central1, us-east1, europe-west1 export GCS_BUCKET_LOCATION=us-central1 @@ -34,10 +45,10 @@ export GOOGLE_SERVICE_ACCOUNT_EMAIL=$(wb auth status 2>&1 | grep "Service accoun export GOOGLE_SERVICE_ACCOUNT_NAME=$(echo "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" | cut -d'@' -f1) # Docker image configuration (auto-generated paths) -IMAGE_NAME="amrplusplus-workbench" +IMAGE_NAME="flores-workbench" IMAGE_TAG="latest" REGISTRY_PATH="us-central1-docker.pkg.dev/${WORKBENCH_GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" # Nextflow profile and config NEXTFLOW_PROFILE="workbench" -NEXTFLOW_CONFIG="params_google-batch.config" +NEXTFLOW_CONFIG="params_google_batch.config" diff --git a/wb/run.sh b/wb/run.sh index 67580fe..fbf3e1e 100755 --- a/wb/run.sh +++ b/wb/run.sh @@ -46,6 +46,7 @@ fi CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" if [[ ! -f "$CONFIG_FILE" ]]; then echo "Error: Configuration file not found: $CONFIG_FILE" + echo "Create it from the template: cp ${SCRIPT_DIR}/config/${ENV}.env.template ${CONFIG_FILE}" exit 1 fi @@ -132,5 +133,4 @@ fi now=$(date +"%Y-%m-%d--%H-%M") # Run nextflow with Google Batch profile -#nextflow -c "${NEXTFLOW_CONFIG}" run main_AMR++.nf --pipeline "standard_AMR_wKraken_and_Bracken" -profile "${NEXTFLOW_PROFILE}" -with-trace "trace-${now}.txt" -resume -bg -nextflow run main_AMR++.nf --pipeline "standard_AMR_wKraken_and_Bracken" -with-trace "trace-${now}.txt" +nextflow run main_AMR++.nf -profile "${NEXTFLOW_PROFILE}" -c "${NEXTFLOW_CONFIG}" --pipeline "standard_AMR_wKraken_and_Bracken" -with-trace "trace-${now}.txt"