Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ envs/amrplusplus-update.sif
logs/
.nextflow*
nextflow-24.10.9-dist
trace-*.txt
wb/config/wb.env
wb/config/gcp.env
34 changes: 18 additions & 16 deletions bin/kraken2_long_to_wide_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,30 @@
'R1':0,
'R2':1,
'R3':2,
'K': 3,
'P': 4,
'C': 5,
'O': 6,
'F': 7,
'G': 8,
'S': 9,
'U': 10
'D': 3,
'K': 4,
'P': 5,
'C': 6,
'O': 7,
'F': 8,
'G': 9,
'S': 10,
'U': 11
}

taxa_level_names = {
0: 'Root1',
1: 'Root2',
2: 'Root3',
3: 'Kingdom',
4: 'Phylum',
5: 'Class',
6: 'Order',
7: 'Family',
8: 'Genus',
9: 'Species',
10: 'Unclassified'
3: 'Domain',
4: 'Kingdom',
5: 'Phylum',
6: 'Class',
7: 'Order',
8: 'Family',
9: 'Genus',
10: 'Species',
11: 'Unclassified'
}


Expand Down
46 changes: 37 additions & 9 deletions config/google_batch.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,58 @@
*/

process {
// Default for all processes
cpus = 4
memory = '16 GB'
machineType = 'n2-standard-4'

cache = 'lenient'
executor = 'google-batch'

withName: 'runqc' {
cpus = 16
memory = "64.GB"
machineType = "n2-standard-16"
memory = '64 GB'
machineType = 'n2-standard-16'
containerOptions = '--env _JAVA_OPTIONS="-Xmx60g"'
}

withName: 'bwa_align' {
withName: 'bowtie2_index' {
cpus = 8
memory = '64 GB'
machineType = 'n2-highmem-8'
}

withName: 'index' {
cpus = 8
memory = '64 GB'
machineType = 'n2-highmem-8'
}

withName: 'bowtie2_align' {
cpus = 32
memory = '128 GB'
machineType = 'n2-standard-32'
}

withName: 'bowtie2_rm_contaminant_fq' {
cpus = 32
memory = "256.GB"
machineType = "n2-highmem-32"
memory = '256 GB'
machineType = 'n2-highmem-32'
}

withName: 'bwa_align' {
cpus = 16
memory = '128 GB'
machineType = 'n2-highmem-32'
}

withName: 'runkraken' {
cpus = 16
memory = "256.GB"
machineType = "n2-highmem-32"
memory = '256 GB'
machineType = 'n2-highmem-32'
}

withName: 'runkrakenInterleaved' {
cpus = 16
memory = '256 GB'
machineType = 'n2-highmem-32'
}
}
140 changes: 140 additions & 0 deletions docs/workbench.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# FloRes on Verily Workbench

**Prerequisites**:
- You must create a Workbench workspace where you have **ADMIN** permissions
- All setup and execution must be done within this workspace

## Dependencies

- **Verily Workbench CLI** (`wb`) - Workbench command-line tool
- **Google Cloud SDK** (`gcloud`) - GCP command-line tool
- **Docker** - For building and pushing container images (must be running)
- **Nextflow v24** - Workflow orchestration (installed in Workbench app)
- **Note**: v25 has breaking changes and is not compatible with this pipeline

## Quick Start: Workbench Orchestration with Google Batch

This guide walks through setting up and running FloRes with Workbench orchestration and Google Batch compute. The setup is split between local commands (for infrastructure) and Workbench app commands (for execution).

### Step 1: Create Workspace and App

Create a new workspace and app in the Workbench UI (or use the CLI if preferred).

### Step 2: Local Setup

Run these commands on your **local machine**:

```bash
# Set your active workspace (replace with your workspace ID)
wb workspace set --id=your-workspace-id

# Copy the Workbench environment template
cp wb/config/wb.env.template wb/config/wb.env
```

Edit `wb/config/wb.env` and set the user-defined variables:
- `GCS_BUCKET`: Your Workbench GCS bucket resource ID (e.g., `nf-output`)
- `GCS_REF_BUCKET`: Bucket containing reference genomes (e.g., `referencegenomes-wb-my-workspace-1234`)
- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`)
- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repo (e.g., `nextflow-containers`)

**Note**: Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations.

Then run:

```bash
# Set up infrastructure (creates buckets, service accounts, etc.)
./wb/setup_infra.sh wb

# Upload input data and reference databases to GCS
./wb/upload_data.sh wb

# Build Docker image and push to Artifact Registry
# NOTE: Docker must be running before executing this command
./wb/build.sh --env wb --push
```

### Step 3: Workbench App Setup

Open your Workbench app, launch the Terminal, and run:

```bash
# Clone the repository
cd repos/ && git clone https://github.com/passdan/FloRes.git && cd FloRes/

# Copy the environment template
cp wb/config/wb.env.template wb/config/wb.env
```

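Now copy the values you set in your local `wb/config/wb.env` (Step 2) into the `wb/config/wb.env` you just created in the Workbench app, so that both environments use the same configuration.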

### Step 4: Run the Pipeline

```bash
./wb/run.sh --env wb
```

Results will be stored in your configured GCS bucket.

**Known Issues**:
- The `gcloud storage cp` command may not correctly resolve Workbench resource names to full `gs://` paths when running `upload_data.sh` or `run.sh`. If you encounter path resolution issues, manually specify the full GCS bucket path in your `wb.env` configuration.

---

## Alternative: Quick Demo in Workbench JupyterLab

For a simple demonstration without Google Batch (both orchestration and execution running in the same Workbench app):

Create a new Workbench workspace and add this git repository in the **Apps** tab.

Create a JupyterLab app instance, launch it, and open the terminal:

```bash
# Initialize conda
conda init
source ~/.bashrc

# Navigate to the repository
cd repos/FloRes

# Create and activate the conda environment
conda env create -f envs/AMR++_env.yaml
conda activate AMR++_env

# Verify Nextflow version 24 is installed
nextflow -v

# Run the test pipeline (takes ~5 minutes)
nextflow run main_AMR++.nf
```

Expected output: results in `~/repos/FloRes/test_results`

---

## Configuration

### Resource Scaling on Google Batch

Google Batch does NOT automatically scale machine types based on CPU/memory requests. Resource scaling is configured in `config/google_batch.config`.

Each process that needs more than default resources must explicitly specify a matching `machineType`. Current resource allocations:

| Process | CPUs | Memory | Machine Type |
|---------|------|--------|-------------|
| Default | 4 | 16 GB | n2-standard-4 |
| runqc | 16 | 64 GB | n2-standard-16 |
| bowtie2_index | 8 | 64 GB | n2-highmem-8 |
| index | 8 | 64 GB | n2-highmem-8 |
| bowtie2_align | 32 | 128 GB | n2-standard-32 |
| bowtie2_rm_contaminant_fq | 32 | 256 GB | n2-highmem-32 |
| bwa_align | 16 | 128 GB | n2-highmem-32 |
| runkraken | 16 | 256 GB | n2-highmem-32 |
| runkrakenInterleaved | 16 | 256 GB | n2-highmem-32 |

### Supporting Environments

**Local** (testing): `./wb/run.sh --env local`
- Requires Docker and Conda

**GCP** (debugging): `./wb/run.sh --env gcp`
- For debugging Google Batch jobs with visible logs
- Requires `gcloud` CLI and Docker
17 changes: 16 additions & 1 deletion envs/containers/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ RUN apt-get update -q && \
subversion \
wget \
g++ \
make \
libarchive13 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -59,5 +60,19 @@ RUN set -x && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
/opt/conda/bin/conda clean -afy && \
/opt/conda/bin/conda install -c conda-forge mamba && \
/opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow && \
/opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow=24 fastp bowtie2 && \
conda clean --all

# Copy AMR++ bin scripts into the container
COPY bin /opt/amrplusplus/bin
RUN chmod +x /opt/amrplusplus/bin/*.py && \
chmod +x /opt/amrplusplus/bin/rarefaction && \
chmod +x /opt/amrplusplus/bin/resistome

# Clone AmrPlusPlus_SNP for SNP verification (replaces empty dir copied from bin/)
RUN rm -rf /opt/amrplusplus/bin/AmrPlusPlus_SNP && \
git clone https://github.com/Isabella136/AmrPlusPlus_SNP.git /opt/amrplusplus/bin/AmrPlusPlus_SNP && \
chmod -R 755 /opt/amrplusplus/bin/AmrPlusPlus_SNP

# Add bin directory to PATH
ENV PATH="/opt/amrplusplus/bin:${PATH}"
2 changes: 1 addition & 1 deletion modules/Alignment/bowtie2-for_AMRplusplus.nf
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,6 @@ process HostRemovalStats {
path("host.removal.stats"), emit: combo_host_rm_stats

"""
${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
"""
}
26 changes: 13 additions & 13 deletions modules/Alignment/bwa.nf
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,22 @@ process bwa_align {
script:
if( deduped == "N")
"""
${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' | \
${SAMTOOLS} sort -@ ${threads} -m 4G -o ${pair_id}_alignment_sorted.bam
${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' | \
${SAMTOOLS} sort -@ ${task.cpus} -m 4G -o ${pair_id}_alignment_sorted.bam
"""
else if( deduped == "Y")
"""
${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
rm ${pair_id}_alignment.sam
${SAMTOOLS} sort -@ ${threads} -m 3G -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
${SAMTOOLS} sort -@ ${task.cpus} -m 3G -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
rm ${pair_id}_alignment.bam
${SAMTOOLS} fixmate -@ ${threads} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} sort -@ ${threads} -m 3G ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam
${SAMTOOLS} fixmate -@ ${task.cpus} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} sort -@ ${task.cpus} -m 3G ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam
rm ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} rmdup -S ${pair_id}_alignment_sorted_fix.sorted.bam ${pair_id}_alignment_dedup.bam
rm ${pair_id}_alignment_sorted_fix.sorted.bam
${SAMTOOLS} view -@ ${threads} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam
${SAMTOOLS} view -@ ${task.cpus} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam
rm ${pair_id}_alignment_dedup.sam
"""
else
Expand Down Expand Up @@ -107,13 +107,13 @@ process bwa_rm_contaminant_fq {
path("${pair_id}.samtools.idxstats"), emit: host_rm_stats

"""
${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} | \
${SAMTOOLS} sort -@ ${threads} -m 4G -o ${pair_id}.host.sorted.bam
${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${task.cpus} | \
${SAMTOOLS} sort -@ ${task.cpus} -m 4G -o ${pair_id}.host.sorted.bam
${SAMTOOLS} index ${pair_id}.host.sorted.bam && ${SAMTOOLS} idxstats ${pair_id}.host.sorted.bam > ${pair_id}.samtools.idxstats
${SAMTOOLS} view -h -f 12 -b ${pair_id}.host.sorted.bam -o ${pair_id}.host.sorted.removed.bam
${SAMTOOLS} sort -n -@ ${threads} -m 3G ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam
${SAMTOOLS} sort -n -@ ${task.cpus} -m 3G ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam
${SAMTOOLS} \
fastq -@ ${threads} -c 6 \
fastq -@ ${task.cpus} -c 6 \
${pair_id}.host.resorted.removed.bam \
-1 ${pair_id}.non.host.R1.fastq.gz \
-2 ${pair_id}.non.host.R2.fastq.gz \
Expand Down Expand Up @@ -143,6 +143,6 @@ process HostRemovalStats {
path("host.removal.stats"), emit: combo_host_rm_stats

"""
${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
"""
}
5 changes: 3 additions & 2 deletions modules/Fastqc/fastqc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ process multiqc {
script:
"""
cp $config/* .
multiqc -v data* --interactive -f --cl-config "max_table_rows: 3000"
mv multiqc_data/multiqc_general_stats.txt .
multiqc -v data* --interactive -f --cl-config "max_table_rows: 3000" --outdir multiqc_data --filename multiqc_report.html
mv multiqc_data/multiqc_report_data/multiqc_general_stats.txt .
mv multiqc_data/multiqc_report.html .
"""
}
2 changes: 1 addition & 1 deletion modules/Microbiome/kraken2.nf
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ process krakenresults {
path("kraken_analytic_matrix.csv")

"""
${PYTHON3} $baseDir/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.csv
${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.csv
"""
}

Expand Down
5 changes: 3 additions & 2 deletions modules/Microbiome/kraken_and_bracken.nf
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,14 @@ process krakenresults {


"""
${PYTHON3} $baseDir/bin/kraken2_long_to_wide_update.py -i ${kraken_reports} -o kraken_analytic_matrix.csv
${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide_update.py -i ${kraken_reports} -o kraken_analytic_matrix.csv
"""
}

process runbracken {
label "microbiome"

errorStrategy { task.exitStatus == 1 ? 'ignore' : 'terminate' }

input:
tuple val(sample_id), path(kraken_report), val(level)
path(krakendb)
Expand Down
Loading