GoekeLab · ch99l · Jun 29, 2026 · Jun 29, 2026
diff --git a/README.md b/README.md
@@ -159,16 +159,38 @@ output/
 │    # (one pair per sample for multi-sample runs)  
 │
 ├── extended_annotations.gtf
-├── se_unique_counts.rds
-├── se_gene_counts.rds
 │
-│   # single-cell EM:
-├── se_transcript_counts_singlecell.rds
+├── unique_counts/
+│   ├── se_unique_counts.rds
+│   ├── matrix.mtx.gz
+│   ├── barcodes.tsv.gz
+│   └── features.tsv.gz
 │
-│   # clustered EM:
+├── gene_counts/
+│   ├── se_gene_counts.rds
+│   ├── matrix.mtx.gz
+│   ├── barcodes.tsv.gz
+│   └── features.tsv.gz
+│
+│   # single-cell EM (--quantification_mode EM):
+├── transcript_counts_singlecell/
+│   ├── se_transcript_counts_singlecell.rds
+│   ├── matrix.mtx.gz
+│   ├── barcodes.tsv.gz
+│   └── features.tsv.gz
+│
+│   # clustered EM (--quantification_mode EM_clusters):
 ├── seurat_obj.rds
-├── se_transcript_counts_clusters.rds
-├── se_gene_counts_clusters.rds
+├── transcript_counts_clusters/
+│   ├── se_transcript_counts_clusters.rds
+│   ├── matrix.mtx.gz
+│   ├── barcodes.tsv.gz
+│   └── features.tsv.gz
+├── gene_counts_clusters/
+│   ├── se_gene_counts_clusters.rds
+│   ├── matrix.mtx.gz
+│   ├── barcodes.tsv.gz
+│   └── features.tsv.gz
 │
 ├── pipeline_info/
 │   ├── execution_timeline.html
@@ -182,20 +204,23 @@ output/
 **Description of the Output Files**
 | File | Description 
 |---|---
-| <sample_name>_demultiplexed.bam | BAM file containing demultiplexed, trimmed and aligned reads
-| <sample_name>_demultiplexed.bam.bai | BAM index for the corresponding BAM file
-| extended_annotations.gtf | A `.gtf` file containing the novel transcripts discovered by Bambu as well as the reference annotations provided by the user.
-| seurat_obj.rds | A [SeuratObject](https://satijalab.github.io/seurat-object/reference/Seurat-class.html) containing normalised counts, PCA embeddings, and cluster assignments. For multi-sample runs, also contains Harmony-integrated embeddings corrected for sequencing technology and capture chemistry. UMAP has not been computed. Only produced when `--quantification_mode` is set to `EM_clusters`.
-| se_unique_counts.rds | A [RangedSummarizedExperiment](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/RangedSummarizedExperiment-class) object containing transcript-level unique counts at single-cell resolution, produced prior to EM quantification. Columns follow the `sampleName_barcode` naming convention.
-| se_gene_counts.rds | A RangedSummarizedExperiment object containing gene-level counts at single-cell resolution. Columns follow the `sampleName_barcode` naming convention.
-| se_transcript_counts_singlecell.rds | A RangedSummarizedExperiment object containing per-cell transcript counts after EM quantification. Columns follow the `sampleName_barcode` naming convention. Only produced when `--quantification_mode` is set to `EM`.
-| se_transcript_counts_clusters.rds | A RangedSummarizedExperiment object containing cluster-level transcript counts after EM quantification. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`.
-| se_gene_counts_clusters.rds | A RangedSummarizedExperiment object containing cluster-level gene counts. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`.
-| software_versions.yml | A YAML file listing the versions of all software tools used during the pipeline run.
-| execution_timeline.html | Pipeline execution timeline. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#timeline-report).
-| execution_report.html | Resource and runtime report for the pipeline run. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html).
-| execution_trace.txt | Per-process execution trace. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#trace-report).
-| pipeline_dag.svg | Workflow DAG diagram. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#dag-visualisation).
+| `<sample>_demultiplexed.bam` | BAM file containing demultiplexed, trimmed and aligned reads
+| `<sample>_demultiplexed.bam.bai` | BAM index for the corresponding BAM file
+| `extended_annotations.gtf` | A `.gtf` file containing the novel transcripts discovered by Bambu as well as the reference annotations provided by the user.
+| `seurat_obj.rds` | A [SeuratObject](https://satijalab.github.io/seurat-object/reference/Seurat-class.html) containing normalised counts, PCA embeddings, and cluster assignments. For multi-sample runs, also contains Harmony-integrated embeddings corrected for sequencing technology and capture chemistry. UMAP has not been computed. Only produced when `--quantification_mode` is set to `EM_clusters`.
+| `se_unique_counts.rds` | A [RangedSummarizedExperiment](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/RangedSummarizedExperiment-class) object containing transcript-level unique counts at single-cell resolution, produced prior to EM quantification. Columns follow the `sampleName_barcode` naming convention.
+| `se_gene_counts.rds` | A RangedSummarizedExperiment object containing gene-level counts at single-cell resolution. Columns follow the `sampleName_barcode` naming convention.
+| `se_transcript_counts_singlecell.rds` | A RangedSummarizedExperiment object containing per-cell transcript counts after EM quantification. Columns follow the `sampleName_barcode` naming convention. Only produced when `--quantification_mode` is set to `EM`.
+| `se_transcript_counts_clusters.rds` | A RangedSummarizedExperiment object containing cluster-level transcript counts after EM quantification. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`.
+| `se_gene_counts_clusters.rds` | A RangedSummarizedExperiment object containing cluster-level gene counts. Columns follow the `clusterId` naming convention for single-sample runs, and `sampleName_clusterId` for multi-sample runs. Only produced when `--quantification_mode` is set to `EM_clusters`.
+| `matrix.mtx.gz` | Gzip-compressed sparse count matrix in [Matrix Market Exchange](https://math.nist.gov/MatrixMarket/formats.html) format (features × barcodes).
+| `barcodes.tsv.gz` | Gzip-compressed list of cell barcodes (one per line) corresponding to the columns of `matrix.mtx.gz`. Barcodes follow the `sampleName_barcode` naming convention for multi-sample runs.
+| `features.tsv.gz` | Gzip-compressed three-column TSV corresponding to the rows of `matrix.mtx.gz`. Column 1 is the feature ID (transcript ID for transcript-level matrices, gene ID for gene-level matrices). Column 2 is the feature name (same as column 1). Column 3 is the feature type (`Transcript Expression` for transcript-level matrices, `Gene Expression` for gene-level matrices).
+| `software_versions.yml` | A YAML file listing the versions of all software tools used during the pipeline run.
+| `execution_timeline.html` | Pipeline execution timeline. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#timeline-report).
+| `execution_report.html` | Resource and runtime report for the pipeline run. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html).
+| `execution_trace.txt` | Per-process execution trace. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#trace-report).
+| `pipeline_dag.svg` | Workflow DAG diagram. See [Nextflow docs](https://www.nextflow.io/docs/latest/tracing.html#dag-visualisation).
 
 **Count Matrices**
 

diff --git a/containers/r/Dockerfile b/containers/r/Dockerfile
@@ -12,6 +12,7 @@ RUN micromamba install -y -n base -c conda-forge -c bioconda \
     r-devtools=2.5.2 \
     r-biocmanager=1.30.27 \
     bioconductor-bambu=3.12.1 \
+    bioconductor-dropletutils=1.30.0 \
     r-seurat=5.4.0 \
     r-harmony=2.0.2 \
     gxx=15.2.0 \

diff --git a/modules/bambu/EM_quant.nf b/modules/bambu/EM_quant.nf
@@ -1,5 +1,7 @@
 process BAMBU_EM{
-	publishDir "$params.output_dir", mode: 'copy', pattern: '*.rds'
+    publishDir "$params.output_dir", mode: 'copy', pattern: 'transcript_counts_singlecell'
+    publishDir "$params.output_dir", mode: 'copy', pattern: 'transcript_counts_clusters'
+    publishDir "$params.output_dir", mode: 'copy', pattern: 'gene_counts_clusters'
     label "r"
     label "low_cpu"
     label "high_mem"
@@ -12,42 +14,48 @@ process BAMBU_EM{
     path(genome)
 
     output:
-    path ('se_transcript_counts_*.rds')
-    path ('se_gene_counts_clusters.rds'), optional: true
+    path ('transcript_counts_singlecell'), optional: true
+    path ('transcript_counts_clusters'), optional: true
+    path ('gene_counts_clusters'), optional: true
     path "versions.yml", topic: 'versions'
 
     script:
     """
     #!/usr/bin/env Rscript
     if ("$params.bambu_path" == "null") { library("bambu") } else { library("devtools"); load_all("$params.bambu_path") }
-
+    library(DropletUtils)
+
     extendedAnno <- readRDS("$extended_annotation")
-    quantData = readRDS("$quant_data")
-    clusters = if ("$has_clusters" == "true") readRDS("$clusters") else NULL
-    degBias <- !is.null(clusters) 
-
-    se = bambu.singlecell(
-        reads = NULL, 
-        annotations = extendedAnno, 
-        genome = "$genome", 
-        quantData = quantData, 
-        assignDist = FALSE, 
-        ncore = $task.cpus, 
-        discovery = FALSE, 
-        quant = TRUE, 
-        verbose = FALSE, 
-        opt.em = list(degradationBias = degBias), 
+    quantData    <- readRDS("$quant_data")
+    clusters     <- if ("$has_clusters" == "true") readRDS("$clusters") else NULL
+    degBias      <- !is.null(clusters)
+
+    se <- bambu.singlecell(
+        reads = NULL,
+        annotations = extendedAnno,
+        genome = "$genome",
+        quantData = quantData,
+        assignDist = FALSE,
+        ncore = $task.cpus,
+        discovery = FALSE,
+        quant = TRUE,
+        verbose = FALSE,
+        opt.em = list(degradationBias = degBias),
         clusters = clusters
     )
 
-    # Save transcript counts; for clustered EM, also save gene counts
     if (is.null(clusters)) {
-        saveRDS(se, "se_transcript_counts_singlecell.rds")
+        write10xCounts("transcript_counts_singlecell", assays(se)\$counts, version = "3", gene.type = "Transcript Expression")
+        saveRDS(se, "transcript_counts_singlecell/se_transcript_counts_singlecell.rds")
     } else {
-        saveRDS(se, "se_transcript_counts_clusters.rds")
-        saveRDS(transcriptToGeneExpression(se), "se_gene_counts_clusters.rds")
+        write10xCounts("transcript_counts_clusters", assays(se)\$counts, version = "3", gene.type = "Transcript Expression")
+        saveRDS(se, "transcript_counts_clusters/se_transcript_counts_clusters.rds")
+
+        se_gene <- transcriptToGeneExpression(se)
+        write10xCounts("gene_counts_clusters", assays(se_gene)\$counts, version = "3")
+        saveRDS(se_gene, "gene_counts_clusters/se_gene_counts_clusters.rds")
     }
-    
+
     writeLines(c('"${task.process}":', paste0('    R: ', R.Version()\$version.string), paste0('    bambu: ', as.character(packageVersion("bambu")))), "versions.yml")
     """
 }
diff --git a/modules/bambu/transcript_discovery.nf b/modules/bambu/transcript_discovery.nf
@@ -1,6 +1,7 @@
-process BAMBU_TRANSCRIPT_DISCOVERY{ 
+process BAMBU_TRANSCRIPT_DISCOVERY{
     publishDir "$params.output_dir", mode: 'copy', pattern: 'extended_annotations.gtf'
-    publishDir "$params.output_dir", mode: 'copy', pattern: '{se_unique_counts,se_gene_counts}.rds'
+    publishDir "$params.output_dir", mode: 'copy', pattern: 'unique_counts'
+    publishDir "$params.output_dir", mode: 'copy', pattern: 'gene_counts'
     publishDir "$params.output_dir/intermediate_R", mode: 'copy', pattern: '*.rds', enabled: params.save_intermediates
     label "r"
     label "medium_cpu"
@@ -12,20 +13,23 @@ process BAMBU_TRANSCRIPT_DISCOVERY{
 	path(genome)
 	path(bambu_annotation)
     val(ndr)
-    
+
 	output:
     path ('quant_data.rds'), emit: quant_data
-    path ('se_unique_counts.rds'), emit: se_unique_counts
-    path ('se_gene_counts.rds'), emit: se_gene_counts
+    path ('unique_counts/se_unique_counts.rds')
+    path ('gene_counts/se_gene_counts.rds'), emit: se_gene_counts
 	path ('extended_annotations.rds'), emit: extended_annotations
-    path ('extended_annotations.gtf'), emit: extended_annotations_gtf
+    path ('extended_annotations.gtf')
     path ('sample_names.rds'), emit: sample_names
+    path ('unique_counts')
+    path ('gene_counts')
     path "versions.yml", topic: 'versions'
 
 	script:
-	""" 
+	"""
 	#!/usr/bin/env Rscript
     if ("$params.bambu_path" == "null") { library("bambu") } else { library("devtools"); load_all("$params.bambu_path") }
+    library(DropletUtils)
 
     annotation <- readRDS("$bambu_annotation")
     readClassFile <- strsplit("${rds_files.join(',')}", ",")[[1]]
@@ -35,30 +39,32 @@ process BAMBU_TRANSCRIPT_DISCOVERY{
     technology <- setNames(strsplit("${meta.collect { m -> m.technology }.join(',')}", ",")[[1]], sampleNames)
 
     # Transcript discovery
-    extendedAnno <- bambu.singlecell(reads = readClassFile, annotations = annotation, genome = "$genome", ncore = $task.cpus, 
+    extendedAnno <- bambu.singlecell(reads = readClassFile, annotations = annotation, genome = "$genome", ncore = $task.cpus,
     discovery = TRUE, quant = FALSE, verbose = FALSE, assignDist = FALSE, NDR = $ndr)
     saveRDS(extendedAnno, "extended_annotations.rds")
     writeToGTF(extendedAnno, "extended_annotations.gtf")
 
     # Quantification without EM
     sampleData <- if (any(startsWith(chemistry, "visium-v"))) sampleData else NULL # Add spatial metadata for visium samples
-    quantData <- bambu.singlecell(reads = readClassFile, annotations = extendedAnno, genome = "$genome", ncore = $task.cpus, 
+    quantData <- bambu.singlecell(reads = readClassFile, annotations = extendedAnno, genome = "$genome", ncore = $task.cpus,
     discovery = FALSE, quant = FALSE, verbose = FALSE, opt.em = list(degradationBias = FALSE), assignDist = TRUE, sampleData = sampleData)
     saveRDS(quantData, "quant_data.rds")
 
     # Generate unique counts SE from quantData
     seDiscovery <- generateUniqueCountsSEFromQuantData(quantData, extendedAnno)
     colData(seDiscovery)\$chemistry  <- unname(chemistry[colData(seDiscovery)\$sampleName]) # Add chemistry into colData (for subsequent batch correction)
     colData(seDiscovery)\$technology <- unname(technology[colData(seDiscovery)\$sampleName]) # Add technology into colData (for subsequent batch correction)
-    saveRDS(seDiscovery, "se_unique_counts.rds")
+    write10xCounts("unique_counts", assays(seDiscovery)\$counts, version = "3", gene.type = "Transcript Expression")
+    saveRDS(seDiscovery, "unique_counts/se_unique_counts.rds")
 
     # Generate gene counts SE from unique counts SE
     seDiscovery.gene <- transcriptToGeneExpression(seDiscovery)
-    saveRDS(seDiscovery.gene, "se_gene_counts.rds")
+    write10xCounts("gene_counts", assays(seDiscovery.gene)\$counts, version = "3")
+    saveRDS(seDiscovery.gene, "gene_counts/se_gene_counts.rds")
 
     # Save sampleNames (required for multi-sample Seurat clustering)
     saveRDS(sampleNames, "sample_names.rds")
-    
+
     writeLines(c('"${task.process}":', paste0('    R: ', R.Version()\$version.string), paste0('    bambu: ', as.character(packageVersion("bambu")))), "versions.yml")
 	"""
 }