diff --git a/README.md b/README.md index 2849b8f..cb7761f 100644 --- a/README.md +++ b/README.md @@ -159,16 +159,38 @@ output/ │ # (one pair per sample for multi-sample runs) │ ├── extended_annotations.gtf -├── se_unique_counts.rds -├── se_gene_counts.rds │ -│ # single-cell EM: -├── se_transcript_counts_singlecell.rds +├── unique_counts/ +│ ├── se_unique_counts.rds +│ ├── matrix.mtx.gz +│ ├── barcodes.tsv.gz +│ └── features.tsv.gz │ -│ # clustered EM: +├── gene_counts/ +│ ├── se_gene_counts.rds +│ ├── matrix.mtx.gz +│ ├── barcodes.tsv.gz +│ └── features.tsv.gz +│ +│ # single-cell EM (--quantification_mode EM): +├── transcript_counts_singlecell/ +│ ├── se_transcript_counts_singlecell.rds +│ ├── matrix.mtx.gz +│ ├── barcodes.tsv.gz +│ └── features.tsv.gz +│ +│ # clustered EM (--quantification_mode EM_clusters): ├── seurat_obj.rds -├── se_transcript_counts_clusters.rds -├── se_gene_counts_clusters.rds +├── transcript_counts_clusters/ +│ ├── se_transcript_counts_clusters.rds +│ ├── matrix.mtx.gz +│ ├── barcodes.tsv.gz +│ └── features.tsv.gz +├── gene_counts_clusters/ +│ ├── se_gene_counts_clusters.rds +│ ├── matrix.mtx.gz +│ ├── barcodes.tsv.gz +│ └── features.tsv.gz │ ├── pipeline_info/ │ ├── execution_timeline.html @@ -182,20 +204,23 @@ output/ **Description of the Output Files** | File | Description |---|--- -| _demultiplexed.bam | BAM file containing demultiplexed, trimmed and aligned reads -| _demultiplexed.bam.bai | BAM index for the corresponding BAM file -| extended_annotations.gtf | A `.gtf` file containing the novel transcripts discovered by Bambu as well as the reference annotations provided by the user. -| seurat_obj.rds | A [SeuratObject](https://satijalab.github.io/seurat-object/reference/Seurat-class.html) containing normalised counts, PCA embeddings, and cluster assignments. For multi-sample runs, also contains Harmony-integrated embeddings corrected for sequencing technology and capture chemistry. UMAP has not been computed. Only produced when `--quantification_mode` is set to `EM_clusters`. -| se_unique_counts.rds | A [RangedSummarizedExperiment](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/RangedSummarizedExperiment-class) object containing transcript-level unique counts at single-cell resolution, produced prior to EM quantification. Columns follow the `sampleName_barcode` naming convention. -| se_gene_counts.rds | A RangedSummarizedExperiment object containing gene-level counts at single-cell resolution. Columns follow the `sampleName_barcode` naming convention. -| se_transcript_counts_singlecell.rds | A RangedSummarizedExperiment object containing per-cell transcript counts after EM quantification. Columns follow the `sampleName_barcode` naming convention. Only produced when `--quantification_mode` is set to `EM`. -| se_transcript_counts_clusters.rds | A RangedSummarizedExperiment object containing cluster-level transcript counts after EM quantification. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`. -| se_gene_counts_clusters.rds | A RangedSummarizedExperiment object containing cluster-level gene counts. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`. -| software_versions.yml | A YAML file listing the versions of all software tools used during the pipeline run. -| execution_timeline.html | Pipeline execution timeline. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#timeline-report). -| execution_report.html | Resource and runtime report for the pipeline run. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html). -| execution_trace.txt | Per-process execution trace. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#trace-report). -| pipeline_dag.svg | Workflow DAG diagram. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#dag-visualisation). +| `_demultiplexed.bam` | BAM file containing demultiplexed, trimmed and aligned reads +| `_demultiplexed.bam.bai` | BAM index for the corresponding BAM file +| `extended_annotations.gtf` | A `.gtf` file containing the novel transcripts discovered by Bambu as well as the reference annotations provided by the user. +| `seurat_obj.rds` | A [SeuratObject](https://satijalab.github.io/seurat-object/reference/Seurat-class.html) containing normalised counts, PCA embeddings, and cluster assignments. For multi-sample runs, also contains Harmony-integrated embeddings corrected for sequencing technology and capture chemistry. UMAP has not been computed. Only produced when `--quantification_mode` is set to `EM_clusters`. +| `se_unique_counts.rds` | A [RangedSummarizedExperiment](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/RangedSummarizedExperiment-class) object containing transcript-level unique counts at single-cell resolution, produced prior to EM quantification. Columns follow the `sampleName_barcode` naming convention. +| `se_gene_counts.rds` | A RangedSummarizedExperiment object containing gene-level counts at single-cell resolution. Columns follow the `sampleName_barcode` naming convention. +| `se_transcript_counts_singlecell.rds` | A RangedSummarizedExperiment object containing per-cell transcript counts after EM quantification. Columns follow the `sampleName_barcode` naming convention. Only produced when `--quantification_mode` is set to `EM`. +| `se_transcript_counts_clusters.rds` | A RangedSummarizedExperiment object containing cluster-level transcript counts after EM quantification. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`. +| `se_gene_counts_clusters.rds` | A RangedSummarizedExperiment object containing cluster-level gene counts. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`. +| `matrix.mtx.gz` | Gzip-compressed sparse count matrix in [Matrix Market Exchange](https://math.nist.gov/MatrixMarket/formats.html) format (features × barcodes). +| `barcodes.tsv.gz` | Gzip-compressed list of cell barcodes (one per line) corresponding to the columns of `matrix.mtx.gz`. Barcodes follow the `sampleName_barcode` naming convention for multi-sample runs. +| `features.tsv.gz` | Gzip-compressed three-column TSV corresponding to the rows of `matrix.mtx.gz`. Column 1 is the feature ID (transcript ID for transcript-level matrices, gene ID for gene-level matrices). Column 2 is the feature name (same as column 1). Column 3 is the feature type (`Transcript Expression` for transcript-level matrices, `Gene Expression` for gene-level matrices). +| `software_versions.yml` | A YAML file listing the versions of all software tools used during the pipeline run. +| `execution_timeline.html` | Pipeline execution timeline. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#timeline-report). +| `execution_report.html` | Resource and runtime report for the pipeline run. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html). +| `execution_trace.txt` | Per-process execution trace. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#trace-report). +| `pipeline_dag.svg` | Workflow DAG diagram. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#dag-visualisation). **Count Matrices** diff --git a/containers/r/Dockerfile b/containers/r/Dockerfile index 0512f79..294f264 100644 --- a/containers/r/Dockerfile +++ b/containers/r/Dockerfile @@ -12,6 +12,7 @@ RUN micromamba install -y -n base -c conda-forge -c bioconda \ r-devtools=2.5.2 \ r-biocmanager=1.30.27 \ bioconductor-bambu=3.12.1 \ + bioconductor-dropletutils=1.30.0 \ r-seurat=5.4.0 \ r-harmony=2.0.2 \ gxx=15.2.0 \ diff --git a/modules/bambu/EM_quant.nf b/modules/bambu/EM_quant.nf index 15ae169..c2e3f0b 100644 --- a/modules/bambu/EM_quant.nf +++ b/modules/bambu/EM_quant.nf @@ -1,5 +1,7 @@ process BAMBU_EM{ - publishDir "$params.output_dir", mode: 'copy', pattern: '*.rds' + publishDir "$params.output_dir", mode: 'copy', pattern: 'transcript_counts_singlecell' + publishDir "$params.output_dir", mode: 'copy', pattern: 'transcript_counts_clusters' + publishDir "$params.output_dir", mode: 'copy', pattern: 'gene_counts_clusters' label "r" label "low_cpu" label "high_mem" @@ -12,42 +14,48 @@ process BAMBU_EM{ path(genome) output: - path ('se_transcript_counts_*.rds') - path ('se_gene_counts_clusters.rds'), optional: true + path ('transcript_counts_singlecell'), optional: true + path ('transcript_counts_clusters'), optional: true + path ('gene_counts_clusters'), optional: true path "versions.yml", topic: 'versions' script: """ #!/usr/bin/env Rscript if ("$params.bambu_path" == "null") { library("bambu") } else { library("devtools"); load_all("$params.bambu_path") } - + library(DropletUtils) + extendedAnno <- readRDS("$extended_annotation") - quantData = readRDS("$quant_data") - clusters = if ("$has_clusters" == "true") readRDS("$clusters") else NULL - degBias <- !is.null(clusters) - - se = bambu.singlecell( - reads = NULL, - annotations = extendedAnno, - genome = "$genome", - quantData = quantData, - assignDist = FALSE, - ncore = $task.cpus, - discovery = FALSE, - quant = TRUE, - verbose = FALSE, - opt.em = list(degradationBias = degBias), + quantData <- readRDS("$quant_data") + clusters <- if ("$has_clusters" == "true") readRDS("$clusters") else NULL + degBias <- !is.null(clusters) + + se <- bambu.singlecell( + reads = NULL, + annotations = extendedAnno, + genome = "$genome", + quantData = quantData, + assignDist = FALSE, + ncore = $task.cpus, + discovery = FALSE, + quant = TRUE, + verbose = FALSE, + opt.em = list(degradationBias = degBias), clusters = clusters ) - # Save transcript counts; for clustered EM, also save gene counts if (is.null(clusters)) { - saveRDS(se, "se_transcript_counts_singlecell.rds") + write10xCounts("transcript_counts_singlecell", assays(se)\$counts, version = "3", gene.type = "Transcript Expression") + saveRDS(se, "transcript_counts_singlecell/se_transcript_counts_singlecell.rds") } else { - saveRDS(se, "se_transcript_counts_clusters.rds") - saveRDS(transcriptToGeneExpression(se), "se_gene_counts_clusters.rds") + write10xCounts("transcript_counts_clusters", assays(se)\$counts, version = "3", gene.type = "Transcript Expression") + saveRDS(se, "transcript_counts_clusters/se_transcript_counts_clusters.rds") + + se_gene <- transcriptToGeneExpression(se) + write10xCounts("gene_counts_clusters", assays(se_gene)\$counts, version = "3") + saveRDS(se_gene, "gene_counts_clusters/se_gene_counts_clusters.rds") } - + writeLines(c('"${task.process}":', paste0(' R: ', R.Version()\$version.string), paste0(' bambu: ', as.character(packageVersion("bambu")))), "versions.yml") """ } \ No newline at end of file diff --git a/modules/bambu/transcript_discovery.nf b/modules/bambu/transcript_discovery.nf index 50daa54..8074d2f 100644 --- a/modules/bambu/transcript_discovery.nf +++ b/modules/bambu/transcript_discovery.nf @@ -1,6 +1,7 @@ -process BAMBU_TRANSCRIPT_DISCOVERY{ +process BAMBU_TRANSCRIPT_DISCOVERY{ publishDir "$params.output_dir", mode: 'copy', pattern: 'extended_annotations.gtf' - publishDir "$params.output_dir", mode: 'copy', pattern: '{se_unique_counts,se_gene_counts}.rds' + publishDir "$params.output_dir", mode: 'copy', pattern: 'unique_counts' + publishDir "$params.output_dir", mode: 'copy', pattern: 'gene_counts' publishDir "$params.output_dir/intermediate_R", mode: 'copy', pattern: '*.rds', enabled: params.save_intermediates label "r" label "medium_cpu" @@ -12,20 +13,23 @@ process BAMBU_TRANSCRIPT_DISCOVERY{ path(genome) path(bambu_annotation) val(ndr) - + output: path ('quant_data.rds'), emit: quant_data - path ('se_unique_counts.rds'), emit: se_unique_counts - path ('se_gene_counts.rds'), emit: se_gene_counts + path ('unique_counts/se_unique_counts.rds') + path ('gene_counts/se_gene_counts.rds'), emit: se_gene_counts path ('extended_annotations.rds'), emit: extended_annotations - path ('extended_annotations.gtf'), emit: extended_annotations_gtf + path ('extended_annotations.gtf') path ('sample_names.rds'), emit: sample_names + path ('unique_counts') + path ('gene_counts') path "versions.yml", topic: 'versions' script: - """ + """ #!/usr/bin/env Rscript if ("$params.bambu_path" == "null") { library("bambu") } else { library("devtools"); load_all("$params.bambu_path") } + library(DropletUtils) annotation <- readRDS("$bambu_annotation") readClassFile <- strsplit("${rds_files.join(',')}", ",")[[1]] @@ -35,14 +39,14 @@ process BAMBU_TRANSCRIPT_DISCOVERY{ technology <- setNames(strsplit("${meta.collect { m -> m.technology }.join(',')}", ",")[[1]], sampleNames) # Transcript discovery - extendedAnno <- bambu.singlecell(reads = readClassFile, annotations = annotation, genome = "$genome", ncore = $task.cpus, + extendedAnno <- bambu.singlecell(reads = readClassFile, annotations = annotation, genome = "$genome", ncore = $task.cpus, discovery = TRUE, quant = FALSE, verbose = FALSE, assignDist = FALSE, NDR = $ndr) saveRDS(extendedAnno, "extended_annotations.rds") writeToGTF(extendedAnno, "extended_annotations.gtf") # Quantification without EM sampleData <- if (any(startsWith(chemistry, "visium-v"))) sampleData else NULL # Add spatial metadata for visium samples - quantData <- bambu.singlecell(reads = readClassFile, annotations = extendedAnno, genome = "$genome", ncore = $task.cpus, + quantData <- bambu.singlecell(reads = readClassFile, annotations = extendedAnno, genome = "$genome", ncore = $task.cpus, discovery = FALSE, quant = FALSE, verbose = FALSE, opt.em = list(degradationBias = FALSE), assignDist = TRUE, sampleData = sampleData) saveRDS(quantData, "quant_data.rds") @@ -50,15 +54,17 @@ process BAMBU_TRANSCRIPT_DISCOVERY{ seDiscovery <- generateUniqueCountsSEFromQuantData(quantData, extendedAnno) colData(seDiscovery)\$chemistry <- unname(chemistry[colData(seDiscovery)\$sampleName]) # Add chemistry into colData (for subsequent batch correction) colData(seDiscovery)\$technology <- unname(technology[colData(seDiscovery)\$sampleName]) # Add technology into colData (for subsequent batch correction) - saveRDS(seDiscovery, "se_unique_counts.rds") + write10xCounts("unique_counts", assays(seDiscovery)\$counts, version = "3", gene.type = "Transcript Expression") + saveRDS(seDiscovery, "unique_counts/se_unique_counts.rds") # Generate gene counts SE from unique counts SE seDiscovery.gene <- transcriptToGeneExpression(seDiscovery) - saveRDS(seDiscovery.gene, "se_gene_counts.rds") + write10xCounts("gene_counts", assays(seDiscovery.gene)\$counts, version = "3") + saveRDS(seDiscovery.gene, "gene_counts/se_gene_counts.rds") # Save sampleNames (required for multi-sample Seurat clustering) saveRDS(sampleNames, "sample_names.rds") - + writeLines(c('"${task.process}":', paste0(' R: ', R.Version()\$version.string), paste0(' bambu: ', as.character(packageVersion("bambu")))), "versions.yml") """ } \ No newline at end of file