From f949409c5c2b930f6a89070c775d305022ed7126 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 10:40:49 +0300 Subject: [PATCH 01/28] Add scdblfinder module skeleton generated by nf-core tools --- modules/local/scdblfinder/environment.yml | 10 +++ modules/local/scdblfinder/main.nf | 83 ++++++++++++++++++++ modules/local/scdblfinder/meta.yml | 77 ++++++++++++++++++ modules/local/scdblfinder/tests/main.nf.test | 78 ++++++++++++++++++ 4 files changed, 248 insertions(+) create mode 100644 modules/local/scdblfinder/environment.yml create mode 100644 modules/local/scdblfinder/main.nf create mode 100644 modules/local/scdblfinder/meta.yml create mode 100644 modules/local/scdblfinder/tests/main.nf.test diff --git a/modules/local/scdblfinder/environment.yml b/modules/local/scdblfinder/environment.yml new file mode 100644 index 00000000..2dbc8004 --- /dev/null +++ b/modules/local/scdblfinder/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # TODO nf-core: List required Conda package(s). + # Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). + # For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. + - "bioconda::bioconductor-scdblfinder=1.24.0" diff --git a/modules/local/scdblfinder/main.nf b/modules/local/scdblfinder/main.nf new file mode 100644 index 00000000..8b093cd4 --- /dev/null +++ b/modules/local/scdblfinder/main.nf @@ -0,0 +1,83 @@ +// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! 
:) +// https://github.com/nf-core/modules/tree/master/modules/nf-core/ +// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: +// https://nf-co.re/join +// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. +// All other parameters MUST be provided using the "task.ext" directive, see here: +// https://www.nextflow.io/docs/latest/process.html#ext +// where "task.ext" is a string. +// Any parameters that need to be evaluated in the context of a particular sample +// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. +// TODO nf-core: Software that can be piped together SHOULD be added to separate module files +// unless there is a run-time, storage advantage in implementing in this way +// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: +// bwa mem | samtools view -B -T ref.fasta +// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty +// list (`[]`) instead of a file can be used to work around this issue. + +process SCDBLFINDER { + tag "$meta.id" + label 'process_medium' + + // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + 'biocontainers/YOUR-TOOL-HERE' }" + + input:// TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. 
indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf + // TODO nf-core: Where applicable please provide/convert compressed files as input/output + // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. + tuple val(meta), path(bam) + + output: + // TODO nf-core: Named file extensions MUST be emitted for ALL output channels + tuple val(meta), path("*.bam"), emit: bam + // TODO nf-core: List additional required output channels/values here + // TODO nf-core: Update the command here to obtain the version number of the software used in this module + // TODO nf-core: If multiple software packages are used in this module, all MUST be added here + // by copying the line below and replacing the current tool with the extra tool(s) + tuple val("${task.process}"), val('scdblfinder'), eval("scdblfinder --version"), topic: versions, emit: versions_scdblfinder + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf + // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. 
"--threads $task.cpus" + // TODO nf-core: Please replace the example samtools command below with your module's command + // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + """ + scdblfinder \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + $bam + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // TODO nf-core: A stub section should mimic the execution of the original module as best as possible + // Have a look at the following examples: + // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 + // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 + // TODO nf-core: If the module doesn't use arguments ($args), you SHOULD remove: + // - The definition of args `def args = task.ext.args ?: ''` above. + // - The use of the variable in the script `echo $args ` below. 
+ """ + echo $args + + touch ${prefix}.bam + """ +} diff --git a/modules/local/scdblfinder/meta.yml b/modules/local/scdblfinder/meta.yml new file mode 100644 index 00000000..b621994a --- /dev/null +++ b/modules/local/scdblfinder/meta.yml @@ -0,0 +1,77 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +# # TODO nf-core: Add a description of the module and list keywords +name: "scdblfinder" +description: write your description here +keywords: + - sort + - example + - genomics +tools: + ## TODO nf-core: Add a description and other details for the software below + - "scdblfinder": + description: "scDblFinder" + homepage: "None" + documentation: "None" + tool_dev_url: "None" + doi: "" + licence: ["GPL v3 + file LICENSE"] + identifier: null + +input: + ### TODO nf-core: Add a description of all of the variables used as input + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + +output: + ### TODO nf-core: Add a description of all of the variables used as output + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.bam": + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + versions_scdblfinder: + - - "${task.process}": + type: string + description: The name of the process + - "scdblfinder": + type: string + description: The name of the tool + - "scdblfinder --version": + type: eval + description: The expression to obtain the version of the tool + +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - scdblfinder: + type: string + description: The name of the tool + - scdblfinder --version: + type: eval + description: The expression to obtain the version of the tool +authors: + - "@KurayiChawatama" +maintainers: + - "@KurayiChawatama" diff --git a/modules/local/scdblfinder/tests/main.nf.test b/modules/local/scdblfinder/tests/main.nf.test new file mode 100644 index 00000000..c4625e94 --- /dev/null +++ b/modules/local/scdblfinder/tests/main.nf.test @@ -0,0 +1,78 @@ +// TODO nf-core: Once you have added the required tests, please run the following command to build this file: +// nf-core modules test scdblfinder +nextflow_process { + + name "Test Process SCDBLFINDER" + script "../main.nf" + process "SCDBLFINDER" + + tag "modules" + tag "modules_" + tag "scdblfinder" + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used + test("sarscov2 - bam") { + + // TODO nf-core: If you are created a test for a chained module + // (the module requires running more than one process to generate the required output) + // add the 'setup' method here. + // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules). 
+ + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() } + //TODO nf-core: Add all required assertions to verify the test output. + // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples. + ) + } + + } + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix. + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() } + ) + } + + } + +} From 0c1c2ee6fb38f752164008b47cad27c4b1d13fce Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 11:09:07 +0300 Subject: [PATCH 02/28] Fix scdblfinder: remove mockDoubletSCE and use real SCE object directly --- modules/local/scdblfinder/environment.yml | 7 +- modules/local/scdblfinder/main.nf | 60 +++--------- modules/local/scdblfinder/meta.yml | 89 ++++++++--------- .../local/scdblfinder/templates/scdblfinder.R | 95 +++++++++++++++++++ modules/local/scdblfinder/tests/main.nf.test | 38 +++----- 5 files changed, 164 insertions(+), 125 deletions(-) create mode 100644 modules/local/scdblfinder/templates/scdblfinder.R diff --git a/modules/local/scdblfinder/environment.yml b/modules/local/scdblfinder/environment.yml index 2dbc8004..509ce20c 100644 --- 
a/modules/local/scdblfinder/environment.yml +++ b/modules/local/scdblfinder/environment.yml @@ -4,7 +4,8 @@ channels: - conda-forge - bioconda dependencies: - # TODO nf-core: List required Conda package(s). - # Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - # For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - "bioconda::bioconductor-scdblfinder=1.24.0" + - "bioconda::bioconductor-singlecellexperiment=1.26.0" + - "bioconda::bioconductor-biocparallel=1.38.0" + - "conda-forge::r-anndatar=0.3.2" + - "conda-forge::r-tidyverse=2.0.0" diff --git a/modules/local/scdblfinder/main.nf b/modules/local/scdblfinder/main.nf index 8b093cd4..a0903896 100644 --- a/modules/local/scdblfinder/main.nf +++ b/modules/local/scdblfinder/main.nf @@ -19,65 +19,31 @@ process SCDBLFINDER { tag "$meta.id" label 'process_medium' - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-d8c5d0c7834f29eb8adde3fe8c4e9b6fbf89db2f:9fecf4e535ec29b85ab3c03bd26e5cca8e7d29a9-0' : + 'quay.io/biocontainers/mulled-v2-d8c5d0c7834f29eb8adde3fe8c4e9b6fbf89db2f:9fecf4e535ec29b85ab3c03bd26e5cca8e7d29a9-0' }" - input:// TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. 
indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + input: + tuple val(meta), path(h5ad) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam - // TODO nf-core: List additional required output channels/values here - // TODO nf-core: Update the command here to obtain the version number of the software used in this module - // TODO nf-core: If multiple software packages are used in this module, all MUST be added here - // by copying the line below and replacing the current tool with the extra tool(s) - tuple val("${task.process}"), val('scdblfinder'), eval("scdblfinder --version"), topic: versions, emit: versions_scdblfinder + tuple val(meta), path("${prefix}.h5ad"), emit: h5ad + tuple val(meta), path("${prefix}.csv"), emit: predictions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. 
"--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) - """ - scdblfinder \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - $bam - """ + prefix = task.ext.prefix ?: "${meta.id}" + template('scdblfinder.R') stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: A stub section should mimic the execution of the original module as best as possible - // Have a look at the following examples: - // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 - // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 - // TODO nf-core: If the module doesn't use arguments ($args), you SHOULD remove: - // - The definition of args `def args = task.ext.args ?: ''` above. - // - The use of the variable in the script `echo $args ` below. 
+ prefix = task.ext.prefix ?: "${meta.id}" """ - echo $args - - touch ${prefix}.bam + touch ${prefix}.h5ad + touch ${prefix}.csv + touch versions.yml """ } diff --git a/modules/local/scdblfinder/meta.yml b/modules/local/scdblfinder/meta.yml index b621994a..367e73c5 100644 --- a/modules/local/scdblfinder/meta.yml +++ b/modules/local/scdblfinder/meta.yml @@ -1,76 +1,67 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -# # TODO nf-core: Add a description of the module and list keywords name: "scdblfinder" -description: write your description here +description: Detect doublets in single-cell RNA-seq data using scDblFinder keywords: - - sort - - example - - genomics + - doublet-detection + - single-cell + - scrnaseq + - quality-control tools: - ## TODO nf-core: Add a description and other details for the software below - "scdblfinder": - description: "scDblFinder" - homepage: "None" - documentation: "None" - tool_dev_url: "None" - doi: "" - licence: ["GPL v3 + file LICENSE"] - identifier: null + description: "scDblFinder: Computational identification of doublets in single-cell transcriptomics data" + homepage: "https://bioconductor.org/packages/scDblFinder" + documentation: "https://bioconductor.org/packages/release/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html" + tool_dev_url: "https://github.com/plger/scDblFinder" + doi: "10.12688/f1000research.73600.2" + licence: ["GPL-3.0"] + identifier: biotools:scdblfinder input: - ### TODO nf-core: Add a description of all of the variables used as input - - meta: type: map description: | Groovy Map containing sample information e.g. 
`[ id:'sample1' ]` - - bam: + - h5ad: type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" + description: AnnData object in h5ad format + pattern: "*.{h5ad}" ontologies: - - edam: "http://edamontology.org/format_2572" # BAM - - edam: "http://edamontology.org/format_2573" # CRAM - - edam: "http://edamontology.org/format_3462" # SAM + - edam: "http://edamontology.org/format_3590" # HDF5 format output: - ### TODO nf-core: Add a description of all of the variables used as output - bam: + h5ad: - - meta: type: map description: | Groovy Map containing sample information e.g. `[ id:'sample1' ]` - - "*.bam": + - "*.h5ad": type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" + description: AnnData object with doublet annotations + pattern: "*.h5ad" ontologies: - - edam: "http://edamontology.org/format_2572" # BAM - - edam: "http://edamontology.org/format_2573" # CRAM - - edam: "http://edamontology.org/format_3462" # SAM - versions_scdblfinder: - - - "${task.process}": - type: string - description: The name of the process - - "scdblfinder": - type: string - description: The name of the tool - - "scdblfinder --version": - type: eval - description: The expression to obtain the version of the tool - -topics: + - edam: "http://edamontology.org/format_3590" # HDF5 format + predictions: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.csv": + type: file + description: CSV file containing doublet predictions (boolean) + pattern: "*.csv" + ontologies: + - edam: "http://edamontology.org/format_3752" # CSV versions: - - - ${task.process}: - type: string - description: The name of the process - - scdblfinder: - type: string - description: The name of the tool - - scdblfinder --version: - type: eval - description: The expression to obtain the version of the tool + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + authors: - "@KurayiChawatama" maintainers: diff --git a/modules/local/scdblfinder/templates/scdblfinder.R b/modules/local/scdblfinder/templates/scdblfinder.R new file mode 100644 index 00000000..8f83ec99 --- /dev/null +++ b/modules/local/scdblfinder/templates/scdblfinder.R @@ -0,0 +1,95 @@ +#!/usr/bin/env Rscript + +library(scDblFinder) +library(tidyverse) +library(SingleCellExperiment) +library(BiocParallel) +library(anndataR) + +adata <- read_h5ad("${h5ad}") +sce <- adata$as_SingleCellExperiment() + +# Set the param to a specified RNG seed for reproducibility +bp <- MulticoreParam(workers = multicoreWorkers(), RNGseed=123) + + +# 10x Genomics multiplet-rate table, used to estimate the expected doublet rate from the number of recovered cells +# Source: 10x multiplet rate table (https://rpubs.com/kenneditodd/doublet_finder_example) +multiplet_rates_10x <- data.frame( + "Multiplet_rate" = c(0.004, 0.008, 0.0160, 0.023, 0.031, + 0.039, 0.046, 0.054, 0.061, 0.069, 0.076), + "Loaded_cells" = c(800, 1600, 3200, 4800, 6400, 8000, 9600, + 11200, 12800, 14400, 16000), + "Recovered_cells" = c(500, 1000, 2000, 3000, 4000, 5000, 6000, + 7000, 8000, 9000, 10000) +) + +# Adjust to use the number of cells in the SCE object +multiplet_rate <- multiplet_rates_10x %>% + dplyr::filter(Recovered_cells < ncol(sce)) %>% + dplyr::slice(which.max(Recovered_cells)) %>% + dplyr::pull(Multiplet_rate) %>% + 
as.numeric() + +message(paste0("Setting multiplet rate to ", multiplet_rate, " for ", ncol(sce), " cells")) + +# Run scDblFinder on the REAL data (not mock data!) +# scDblFinder creates artificial doublets internally +set.seed(123) +sce <- scDblFinder( + sce, + BPPARAM = bp, + dbr = multiplet_rate, + artificialDoublets = ncol(sce) +) + +# Generate a summary table +message("scDblFinder results summary:") +print(table(sce\$scDblFinder.class)) + +# Rename scDblFinder.* columns for consistency with other doublet methods +scdbl_cols <- grep("^scDblFinder\\\\.", colnames(colData(sce)), value = TRUE) +new_scdbl_cols <- paste0("scdblfinder_", gsub("^scDblFinder\\\\.", "", gsub("\\\\.", "_", scdbl_cols))) + +# Rename columns in colData(sce) +for (i in seq_along(scdbl_cols)) { + colData(sce)[[new_scdbl_cols[i]]] <- colData(sce)[[scdbl_cols[i]]] + colData(sce)[[scdbl_cols[i]]] <- NULL # Remove the original column +} + +# Convert back to AnnData and save +adata_processed <- as_AnnData(sce) +write_h5ad(adata_processed, "${prefix}.h5ad") + +# Extract predictions for doublet removal step +# Create a binary doublet call based on class +predictions <- data.frame( + doublet = colData(sce)\$scdblfinder_class == "doublet", + row.names = colnames(sce) +) +colnames(predictions) <- "${prefix}" + +# Save predictions to CSV +write.csv(predictions, "${prefix}.csv") + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +scDblFinder.version <- as.character(packageVersion('scDblFinder')) + +writeLines( + c( + '"${task.process}":', + paste(' R:', r.version), + paste(' scDblFinder:', scDblFinder.version) + ), +'versions.yml') + +################################################ +################################################ 
+################################################ +################################################ diff --git a/modules/local/scdblfinder/tests/main.nf.test b/modules/local/scdblfinder/tests/main.nf.test index c4625e94..c0168559 100644 --- a/modules/local/scdblfinder/tests/main.nf.test +++ b/modules/local/scdblfinder/tests/main.nf.test @@ -1,5 +1,3 @@ -// TODO nf-core: Once you have added the required tests, please run the following command to build this file: -// nf-core modules test scdblfinder nextflow_process { name "Test Process SCDBLFINDER" @@ -7,25 +5,17 @@ nextflow_process { process "SCDBLFINDER" tag "modules" - tag "modules_" + tag "modules_local" tag "scdblfinder" - // TODO nf-core: Change the test name preferably indicating the test-data and file-format used - test("sarscov2 - bam") { - - // TODO nf-core: If you are created a test for a chained module - // (the module requires running more than one process to generate the required output) - // add the 'setup' method here. - // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules). + test("homo_sapiens - h5ad") { when { process { """ - // TODO nf-core: define inputs of the process here. 
Example: - input[0] = [ [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) ] """ } @@ -35,29 +25,28 @@ nextflow_process { assert process.success assertAll( { assert snapshot( - process.out, - path(process.out.versions[0]).yaml + process.out.versions, + process.out.predictions, + // Hashing does not work due to this issue: + // https://github.com/scverse/anndataR/issues/272 + file(process.out.h5ad.get(0).get(1)).exists(), + file(process.out.h5ad.get(0).get(1)).size() ).match() } - //TODO nf-core: Add all required assertions to verify the test output. - // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples. ) } } - // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix. - test("sarscov2 - bam - stub") { + test("homo_sapiens - h5ad - stub") { options "-stub" when { process { """ - // TODO nf-core: define inputs of the process here. 
Example: - input[0] = [ [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) ] """ } @@ -66,10 +55,7 @@ nextflow_process { then { assert process.success assertAll( - { assert snapshot( - process.out, - path(process.out.versions[0]).yaml - ).match() } + { assert snapshot(process.out).match() } ) } From 8ba85f78cbf888f35c237c3888e307f07d5867cb Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 11:21:20 +0300 Subject: [PATCH 03/28] Integrate scdblfinder into pipeline configuration and tests --- conf/modules.config | 10 ++++++++++ conf/test.config | 2 +- conf/test_full.config | 2 +- nextflow_schema.json | 4 ++-- subworkflows/local/doublet_detection/main.nf | 9 +++++++++ 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index beae2c2f..0504d6db 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -213,6 +213,16 @@ process { ] } + withName: SCDBLFINDER { + ext.prefix = { meta.id + '_scdblfinder' } + publishDir = [ + path: { "${params.outdir}/quality_control/doublet_detection/scdblfinder" }, + mode: params.publish_dir_mode, + enabled: params.save_intermediates, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: DOUBLET_REMOVAL { publishDir = [ path: { "${params.outdir}/quality_control/doublet_detection" }, diff --git a/conf/test.config b/conf/test.config index 189363f9..c58705ae 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { // Input data input = params.pipelines_testdata_base_path + 'samplesheet.csv' integration_methods = 'scvi,harmony,bbknn,combat' - doublet_detection = 'solo,scrublet,scds' + doublet_detection = 'solo,scrublet,scds,scdblfinder' celltypist_model = 'Adult_Human_Skin' celldex_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/scdownstream/singleR/references.csv' integration_hvgs = 500 diff --git a/conf/test_full.config b/conf/test_full.config index 8262e5bf..7d64f3f0 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -25,7 +25,7 @@ params { // Input data for full size test input = params.pipelines_testdata_base_path + 'samplesheet.csv' integration_methods = 'scvi,harmony,bbknn,combat' - doublet_detection = 'solo,scrublet,doubletdetection,scds' + doublet_detection = 'solo,scrublet,doubletdetection,scds,scdblfinder' celltypist_model = 'Adult_Human_Skin' celldex_reference = 'hpca__2024-02-26,monaco_immune__2024-02-26' // Feature: Support offline. celldex_reference_label = 'label.main,label.fine' diff --git a/nextflow_schema.json b/nextflow_schema.json index 5157e224..2233d8a6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -105,8 +105,8 @@ "type": "string", "default": "scrublet", "description": "Specify the tools to use for doublet detection. Setting to 'none' will skip this step", - "help_text": "If you want to use multiple tools, separate them with a comma. Available methods are: solo, scrublet, doubletdetection, scds", - "pattern": "^(none|((solo|scrublet|doubletdetection|scds)?,?)*[^,]+$)" + "help_text": "If you want to use multiple tools, separate them with a comma. 
Available methods are: solo, scrublet, doubletdetection, scds, scdblfinder", + "pattern": "^(none|((solo|scrublet|doubletdetection|scds|scdblfinder)?,?)*[^,]+$)" }, "doublet_detection_threshold": { "type": "integer", diff --git a/subworkflows/local/doublet_detection/main.nf b/subworkflows/local/doublet_detection/main.nf index c7371ff3..110a3c4a 100644 --- a/subworkflows/local/doublet_detection/main.nf +++ b/subworkflows/local/doublet_detection/main.nf @@ -2,6 +2,7 @@ include { SCVITOOLS_SOLO } from '../../../modules/nf-core/scvitools/solo' include { SCANPY_SCRUBLET } from '../../../modules/nf-core/scanpy/scrublet' include { DOUBLETDETECTION } from '../../../modules/nf-core/doubletdetection' include { SCDS } from '../../../modules/local/doublet_detection/scds' +include { SCDBLFINDER } from '../../../modules/local/scdblfinder' include { DOUBLET_REMOVAL } from '../../../modules/local/doublet_detection/doublet_removal' workflow DOUBLET_DETECTION { @@ -56,6 +57,14 @@ workflow DOUBLET_DETECTION { ch_versions = DOUBLETDETECTION.out.versions } + if (methods.contains('scdblfinder')) { + SCDBLFINDER ( + ch_h5ad + ) + ch_predictions = ch_predictions.mix(SCDBLFINDER.out.predictions) + ch_versions = ch_versions.mix(SCDBLFINDER.out.versions) + } + DOUBLET_REMOVAL ( ch_h5ad.join(ch_predictions.groupTuple()), threshold, From 5a109a5597e9749688b842bc7a3732e418950b1c Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 13:11:33 +0300 Subject: [PATCH 04/28] Fix scdblfinder module implementation and tests --- modules/local/scdblfinder/environment.yml | 14 +- modules/local/scdblfinder/main.nf | 23 +--- .../local/scdblfinder/templates/scdblfinder.R | 46 +++++-- modules/local/scdblfinder/tests/main.nf.test | 55 ++++---- .../local/scdblfinder/tests/main.nf.test.snap | 122 ++++++++++++++++++ 5 files changed, 197 insertions(+), 63 deletions(-) create mode 100644 modules/local/scdblfinder/tests/main.nf.test.snap diff --git a/modules/local/scdblfinder/environment.yml 
b/modules/local/scdblfinder/environment.yml index 509ce20c..ea37ba79 100644 --- a/modules/local/scdblfinder/environment.yml +++ b/modules/local/scdblfinder/environment.yml @@ -1,11 +1,11 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: scdblfinder channels: - conda-forge - bioconda dependencies: - - "bioconda::bioconductor-scdblfinder=1.24.0" - - "bioconda::bioconductor-singlecellexperiment=1.26.0" - - "bioconda::bioconductor-biocparallel=1.38.0" - - "conda-forge::r-anndatar=0.3.2" - - "conda-forge::r-tidyverse=2.0.0" + - bioconda::bioconductor-scdblfinder=1.24.0 + - bioconda::bioconductor-singlecellexperiment=1.32.0 + - bioconda::bioconductor-biocparallel=1.44.0 + - bioconda::bioconductor-anndatar=1.0.2 + - bioconda::bioconductor-rhdf5=2.54.1 + - conda-forge::r-tidyverse=2.0.0 \ No newline at end of file diff --git a/modules/local/scdblfinder/main.nf b/modules/local/scdblfinder/main.nf index a0903896..18727835 100644 --- a/modules/local/scdblfinder/main.nf +++ b/modules/local/scdblfinder/main.nf @@ -1,28 +1,11 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. 
-// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process SCDBLFINDER { tag "$meta.id" - label 'process_medium' + label 'process_low' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-d8c5d0c7834f29eb8adde3fe8c4e9b6fbf89db2f:9fecf4e535ec29b85ab3c03bd26e5cca8e7d29a9-0' : - 'quay.io/biocontainers/mulled-v2-d8c5d0c7834f29eb8adde3fe8c4e9b6fbf89db2f:9fecf4e535ec29b85ab3c03bd26e5cca8e7d29a9-0' }" + 'oras://community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:28444625ead66428' : + 'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:0f9db6b0855861de' }" input: tuple val(meta), path(h5ad) diff --git a/modules/local/scdblfinder/templates/scdblfinder.R b/modules/local/scdblfinder/templates/scdblfinder.R index 8f83ec99..48e3b1a2 100644 --- a/modules/local/scdblfinder/templates/scdblfinder.R +++ b/modules/local/scdblfinder/templates/scdblfinder.R @@ -7,7 +7,7 @@ library(BiocParallel) library(anndataR) adata <- read_h5ad("${h5ad}") -sce <- adata$as_SingleCellExperiment() +sce <- adata\$as_SingleCellExperiment() # Set the param to a specified RNG seed for reproducibility bp <- MulticoreParam(workers = multicoreWorkers(), RNGseed=123) @@ -33,28 +33,42 @@ multiplet_rate <- multiplet_rates_10x %>% message(paste0("Setting multiplet rate to ", multiplet_rate, " for ", 
ncol(sce), " cells")) -# Run scDblFinder on the REAL data (not mock data!) -# scDblFinder creates artificial doublets internally +# Save original cell names and count before overwriting sce +original_cell_names <- colnames(sce) +n_cells <- ncol(sce) + +# Run scDblFinder on the counts matrix (first assay) +# scDblFinder creates artificial doublets internally and returns a new SCE set.seed(123) sce <- scDblFinder( - sce, + assays(sce)[[1]], BPPARAM = bp, dbr = multiplet_rate, - artificialDoublets = ncol(sce) + artificialDoublets = n_cells ) +# Restore original cell names +if (!is.null(original_cell_names) && length(original_cell_names) == ncol(sce)) { + colnames(sce) <- original_cell_names +} + # Generate a summary table message("scDblFinder results summary:") print(table(sce\$scDblFinder.class)) # Rename scDblFinder.* columns for consistency with other doublet methods scdbl_cols <- grep("^scDblFinder\\\\.", colnames(colData(sce)), value = TRUE) -new_scdbl_cols <- paste0("scdblfinder_", gsub("^scDblFinder\\\\.", "", gsub("\\\\.", "_", scdbl_cols))) -# Rename columns in colData(sce) +# First remove "scDblFinder." 
prefix, THEN replace remaining dots with underscores +new_scdbl_cols <- paste0("scdblfinder_", gsub("\\\\.", "_", gsub("^scDblFinder\\\\.", "", scdbl_cols))) + +# Rename columns in colData(sce) - create new columns first, then delete old ones for (i in seq_along(scdbl_cols)) { colData(sce)[[new_scdbl_cols[i]]] <- colData(sce)[[scdbl_cols[i]]] - colData(sce)[[scdbl_cols[i]]] <- NULL # Remove the original column +} +# Now delete old columns +for (col in scdbl_cols) { + colData(sce)[[col]] <- NULL } # Convert back to AnnData and save @@ -63,10 +77,18 @@ write_h5ad(adata_processed, "${prefix}.h5ad") # Extract predictions for doublet removal step # Create a binary doublet call based on class -predictions <- data.frame( - doublet = colData(sce)\$scdblfinder_class == "doublet", - row.names = colnames(sce) -) +# Ensure we have valid row names +if (is.null(colnames(sce)) || length(colnames(sce)) != ncol(sce)) { + colnames(sce) <- paste0("cell_", seq_len(ncol(sce))) +} + +# Create predictions vector +doublet_calls <- colData(sce)\$scdblfinder_class == "doublet" + +# Create data frame without row.names first, then add them +predictions <- data.frame(doublet = doublet_calls) +row.names(predictions) <- colnames(sce) + colnames(predictions) <- "${prefix}" # Save predictions to CSV diff --git a/modules/local/scdblfinder/tests/main.nf.test b/modules/local/scdblfinder/tests/main.nf.test index c0168559..1ed6f7ec 100644 --- a/modules/local/scdblfinder/tests/main.nf.test +++ b/modules/local/scdblfinder/tests/main.nf.test @@ -1,61 +1,68 @@ nextflow_process { name "Test Process SCDBLFINDER" - script "../main.nf" + script "modules/local/scdblfinder/main.nf" process "SCDBLFINDER" tag "modules" tag "modules_local" - tag "scdblfinder" - test("homo_sapiens - h5ad") { + test("Should run without failures") { when { + params { + outdir = "$outputDir" + } process { """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) - ] + input[0] = channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) + ] + ) """ } } then { - assert process.success assertAll( - { assert snapshot( - process.out.versions, - process.out.predictions, - // Hashing does not work due to this issue: - // https://github.com/scverse/anndataR/issues/272 - file(process.out.h5ad.get(0).get(1)).exists(), - file(process.out.h5ad.get(0).get(1)).size() - ).match() } + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.predictions, + // Hashing does not work due to this issue: + // https://github.com/scverse/anndataR/issues/272 + file(process.out.h5ad.get(0).get(1)).exists(), + file(process.out.h5ad.get(0).get(1)).size() + ).match() } ) } } - test("homo_sapiens - h5ad - stub") { + test("Should run without failures - stub") { - options "-stub" + options '-stub' when { + params { + outdir = "$outputDir" + } process { """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) - ] + input[0] = channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) + ] + ) """ } } then { - assert process.success assertAll( - { assert snapshot(process.out).match() } + { assert process.success }, + { assert snapshot(process.out).match() } ) } diff --git a/modules/local/scdblfinder/tests/main.nf.test.snap b/modules/local/scdblfinder/tests/main.nf.test.snap new file mode 100644 index 00000000..721b5284 --- /dev/null +++ b/modules/local/scdblfinder/tests/main.nf.test.snap @@ -0,0 +1,122 @@ +{ + "homo_sapiens - h5ad - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + 
"test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "h5ad": [ + [ + { + "id": "test" + }, + "test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "predictions": [ + [ + { + "id": "test" + }, + "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + } + ], + "timestamp": "2026-03-12T11:44:51.894263326", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Should run without failures - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "h5ad": [ + [ + { + "id": "test" + }, + "test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "predictions": [ + [ + { + "id": "test" + }, + "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + } + ], + "timestamp": "2026-03-12T12:34:23.397301125", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,ce056c78586769ad5433f5fbb86f70c7" + ], + [ + [ + { + "id": "test" + }, + "test_scdblfinder.csv:md5,130130ae215768e16e0df93a064dc5e9" + ] + ], + true, + 5101352 + ], + "timestamp": "2026-03-12T13:08:16.847676966", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file From 0fb9d77ae5096b2f8fbc33274d073bb3397d2ffc Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 13:25:14 +0300 Subject: [PATCH 05/28] Update 
documentation to include scDblFinder --- CHANGELOG.md | 1 + README.md | 1 + docs/output.md | 2 +- modules/local/scdblfinder/main.nf | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ae737c6..888d9ccd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Initial release of nf-core/scdownstream, created with the [nf-core](https://nf-c ### `Added` - Added `singleR` module for automated cell type annotation. +- Added `scDblFinder` module for doublet detection. ### `Fixed` diff --git a/README.md b/README.md index 9eb07ccb..6edbd11e 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ Steps marked with the boat icon are not yet implemented. For the other steps, th - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html) - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html) - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html) + - [scDblFinder](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html) 2. Sample aggregation 1. Merge into a single h5ad file 2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/)) diff --git a/docs/output.md b/docs/output.md index 27fad158..04ffcff4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -57,7 +57,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `custom_thresholds/`: Results of applying user-defined QC thresholds. - `doublet_detection/`: Directories related to doublet detection. - `input_rds/`: RDS version of the h5ad file that is used as input to the doublet detection tools. - - `(doubletdetection|scds|scrublet|solo)/`: Results of doublet detection. Each directory contains a filtered `h5ad`/`rds` and a `csv`/`pkl` file with the doublet annotations. + - `(doubletdetection|scdblfinder|scds|scrublet|solo)/`: Results of doublet detection. 
Each directory contains a filtered `h5ad`/`rds` and a `csv`/`pkl` file with the doublet annotations. - `${sample_id}.h5ad`: The h5ad without doublets. - `qc_preprocessed/`: QC plots for the preprocessed data. diff --git a/modules/local/scdblfinder/main.nf b/modules/local/scdblfinder/main.nf index 18727835..415593ed 100644 --- a/modules/local/scdblfinder/main.nf +++ b/modules/local/scdblfinder/main.nf @@ -1,6 +1,6 @@ process SCDBLFINDER { tag "$meta.id" - label 'process_low' + label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? From 9af9d361e74d1af7462ad5c3d9cb671f2391392a Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 13:35:52 +0300 Subject: [PATCH 06/28] added more documentation for scdblfinder --- ro-crate-metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 0ce2b699..07662401 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:29+00:00", - "description": "

\n \n \n \"nf-core/scdownstream\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/scdownstream)\n[![GitHub Actions CI Status](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scdownstream/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/scdownstream)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23scdownstream-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/scdownstream)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/scdownstream** is a bioinformatics pipeline that can be used to process already quantified single-cell RNA-seq data. It takes a samplesheet and h5ad-, SingleCellExperiment/Seurat- or CSV files as input and performs quality control, integration, dimensionality reduction and clustering. It produces an integrated h5ad and SingleCellExperiment file and an extensive QC report.\n\nThe pipeline is based on the learnings and implementations from the following pipelines (alphabetical):\n\n- [panpipes](https://github.com/DendrouLab/panpipes)\n- [scFlow](https://combiz.github.io/scFlow/)\n- [scRAFIKI](https://github.com/Mye-InfoBank/scRAFIKI)\n- [YASCP](https://github.com/wtsi-hgi/yascp)\n\n# ![nf-core/scdownstream](docs/images/metromap.png)\n\nSteps marked with the boat icon are not yet implemented. For the other steps, the pipeline uses the following tools:\n\n1. Per-sample preprocessing\n 1. Convert all RDS files to h5ad format\n 2. Create filtered matrix (if not provided)\n 3. Present QC for raw counts ([`MultiQC`](http://multiqc.info/))\n 4. Remove ambient RNA\n - [decontX](https://bioconductor.org/packages/release/bioc/html/decontX.html)\n - [soupX](https://cran.r-project.org/web/packages/SoupX/readme/README.html)\n - [cellbender](https://cellbender.readthedocs.io/en/latest/)\n - [scAR](https://docs.scvi-tools.org/en/stable/user_guide/models/scar.html)\n 5. 
Apply user-defined QC filters (can be defined per sample in the samplesheet)\n 6. Doublet detection (Majority vote possible)\n - [SOLO](https://docs.scvi-tools.org/en/stable/user_guide/models/solo.html)\n - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html)\n - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html)\n - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html)\n2. Sample aggregation\n 1. Merge into a single h5ad file\n 2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/))\n 3. Integration\n - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html)\n - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html)\n - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html)\n - [BBKNN](https://github.com/Teichlab/bbknn)\n - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)\n - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)\n3. Cell type annotation\n - [celltypist](https://www.celltypist.org/)\n4. Clustering and dimensionality reduction\n 1. [Leiden clustering](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.leiden.html)\n 2. [UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)\n5. Create report ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n> [!NOTE]\n> If you are confused by the terms `filtered` and `unfiltered`, please check out the respective [documentation](https://nf-co.re/scdownstream/dev/docs/usage/#filtered-and-unfiltered-matrices).\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,unfiltered\nsample1,/absolute/path/to/sample1.h5ad\nsample2,/absolute/path/to/sample3.h5\nsample3,relative/path/to/sample2.rds\nsample4,/absolute/path/to/sample3.csv\n```\n\nEach entry represents a h5ad, h5, RDS or CSV file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function.\nCSV files should contain a matrix with genes as columns and cells as rows. The first column should contain cell names/barcodes.\n\n-->\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/scdownstream \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scdownstream/usage) and the [parameter documentation](https://nf-co.re/scdownstream/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scdownstream/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/scdownstream/output).\n\n## Credits\n\nnf-core/scdownstream was originally written by [Nico Trummer](https://github.com/nictru).\n\nWe thank the following people for their extensive assistance in the development of this pipeline (alphabetical):\n\n- [Fabian Rost](https://github.com/fbnrst)\n- [Fabiola Curion](https://github.com/bio-la)\n- [Gregor Sturm](https://github.com/grst)\n- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)\n- [Lukas Heumos](https://github.com/zethson)\n- [Matiss Ozols](https://github.com/maxozo)\n- [Nathan Skene](https://github.com/NathanSkene)\n- [Nurun Fancy](https://github.com/nfancy)\n- [Riley Grindle](https://github.com/Riley-Grindle)\n- [Ryan Seaman](https://github.com/RPSeaman)\n- [Steffen M\u00f6ller](https://github.com/smoe)\n- [Wojtek Sowinski](https://github.com/WojtekSowinski)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#scdownstream` channel](https://nfcore.slack.com/channels/scdownstream) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive 
list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/scdownstream\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/scdownstream)\n[![GitHub Actions CI Status](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scdownstream/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/scdownstream)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23scdownstream-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/scdownstream)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/scdownstream** is a bioinformatics pipeline that can be used to process already quantified single-cell RNA-seq data. It takes a samplesheet and h5ad-, SingleCellExperiment/Seurat- or CSV files as input and performs quality control, integration, dimensionality reduction and clustering. It produces an integrated h5ad and SingleCellExperiment file and an extensive QC report.\n\nThe pipeline is based on the learnings and implementations from the following pipelines (alphabetical):\n\n- [panpipes](https://github.com/DendrouLab/panpipes)\n- [scFlow](https://combiz.github.io/scFlow/)\n- [scRAFIKI](https://github.com/Mye-InfoBank/scRAFIKI)\n- [YASCP](https://github.com/wtsi-hgi/yascp)\n\n# ![nf-core/scdownstream](docs/images/metromap.png)\n\nSteps marked with the boat icon are not yet implemented. For the other steps, the pipeline uses the following tools:\n\n1. Per-sample preprocessing\n 1. Convert all RDS files to h5ad format\n 2. Create filtered matrix (if not provided)\n 3. Present QC for raw counts ([`MultiQC`](http://multiqc.info/))\n 4. Remove ambient RNA\n - [decontX](https://bioconductor.org/packages/release/bioc/html/decontX.html)\n - [soupX](https://cran.r-project.org/web/packages/SoupX/readme/README.html)\n - [cellbender](https://cellbender.readthedocs.io/en/latest/)\n - [scAR](https://docs.scvi-tools.org/en/stable/user_guide/models/scar.html)\n 5. 
Apply user-defined QC filters (can be defined per sample in the samplesheet)\n 6. Doublet detection (Majority vote possible)\n - [SOLO](https://docs.scvi-tools.org/en/stable/user_guide/models/solo.html)\n - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html)\n - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html)\n - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html)\n - [scDblFinder](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html)\n2. Sample aggregation\n 1. Merge into a single h5ad file\n 2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/))\n 3. Integration\n - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html)\n - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html)\n - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html)\n - [BBKNN](https://github.com/Teichlab/bbknn)\n - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)\n - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)\n3. Cell type annotation\n - [celltypist](https://www.celltypist.org/)\n4. Clustering and dimensionality reduction\n 1. [Leiden clustering](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.leiden.html)\n 2. [UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)\n5. Create report ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n> [!NOTE]\n> If you are confused by the terms `filtered` and `unfiltered`, please check out the respective [documentation](https://nf-co.re/scdownstream/dev/docs/usage/#filtered-and-unfiltered-matrices).\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,unfiltered\nsample1,/absolute/path/to/sample1.h5ad\nsample2,/absolute/path/to/sample3.h5\nsample3,relative/path/to/sample2.rds\nsample4,/absolute/path/to/sample3.csv\n```\n\nEach entry represents a h5ad, h5, RDS or CSV file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function.\nCSV files should contain a matrix with genes as columns and cells as rows. The first column should contain cell names/barcodes.\n\n-->\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/scdownstream \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scdownstream/usage) and the [parameter documentation](https://nf-co.re/scdownstream/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scdownstream/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/scdownstream/output).\n\n## Credits\n\nnf-core/scdownstream was originally written by [Nico Trummer](https://github.com/nictru).\n\nWe thank the following people for their extensive assistance in the development of this pipeline (alphabetical):\n\n- [Fabian Rost](https://github.com/fbnrst)\n- [Fabiola Curion](https://github.com/bio-la)\n- [Gregor Sturm](https://github.com/grst)\n- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)\n- [Lukas Heumos](https://github.com/zethson)\n- [Matiss Ozols](https://github.com/maxozo)\n- [Nathan Skene](https://github.com/NathanSkene)\n- [Nurun Fancy](https://github.com/nfancy)\n- [Riley Grindle](https://github.com/Riley-Grindle)\n- [Ryan Seaman](https://github.com/RPSeaman)\n- [Steffen M\u00f6ller](https://github.com/smoe)\n- [Wojtek Sowinski](https://github.com/WojtekSowinski)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#scdownstream` channel](https://nfcore.slack.com/channels/scdownstream) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive 
list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From bd40dce9ce7f706abfd6d0be3f9ec9913076d754 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 14:31:57 +0300 Subject: [PATCH 07/28] added scdblfinder citation to citations md --- CITATIONS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index bee9e80f..5ce4742e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -47,6 +47,10 @@ > Cannoodt R, Zappia L, Morgan M, Deconinck L (2025). anndataR: AnnData interoperability in R. R package version 0.99.0 +- [scDblFinder](https://pubmed.ncbi.nlm.nih.gov/35118618/) + + > Germain P, Lun A, Garcia Meixide C, Macnair W, Robinson M. Doublet identification in single-cell sequencing data using scDblFinder. F1000Res. 2022;11:979. doi: 10.12688/f1000research.73600.2. 
+ ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From 807f00f277fff41a774e82fd238e534a9b9650ea Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 14:39:21 +0300 Subject: [PATCH 08/28] removed template comment from meta yml --- modules/local/scdblfinder/meta.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/scdblfinder/meta.yml b/modules/local/scdblfinder/meta.yml index 367e73c5..33810388 100644 --- a/modules/local/scdblfinder/meta.yml +++ b/modules/local/scdblfinder/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "scdblfinder" description: Detect doublets in single-cell RNA-seq data using scDblFinder keywords: From 275fac8cb2ef149439ab4f08f04ce153213a6a2e Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 14:46:08 +0300 Subject: [PATCH 09/28] moved scdblfinder module to doublet detection dirtectory --- .../local/{ => doublet_detection}/scdblfinder/environment.yml | 0 modules/local/{ => doublet_detection}/scdblfinder/main.nf | 0 modules/local/{ => doublet_detection}/scdblfinder/meta.yml | 0 .../{ => doublet_detection}/scdblfinder/templates/scdblfinder.R | 0 .../{ => doublet_detection}/scdblfinder/tests/main.nf.test | 2 +- .../{ => doublet_detection}/scdblfinder/tests/main.nf.test.snap | 0 subworkflows/local/doublet_detection/main.nf | 2 +- 7 files changed, 2 insertions(+), 2 deletions(-) rename modules/local/{ => doublet_detection}/scdblfinder/environment.yml (100%) rename modules/local/{ => doublet_detection}/scdblfinder/main.nf (100%) rename modules/local/{ => doublet_detection}/scdblfinder/meta.yml (100%) rename modules/local/{ => doublet_detection}/scdblfinder/templates/scdblfinder.R (100%) rename modules/local/{ => doublet_detection}/scdblfinder/tests/main.nf.test (96%) rename modules/local/{ => doublet_detection}/scdblfinder/tests/main.nf.test.snap (100%) diff --git 
a/modules/local/scdblfinder/environment.yml b/modules/local/doublet_detection/scdblfinder/environment.yml similarity index 100% rename from modules/local/scdblfinder/environment.yml rename to modules/local/doublet_detection/scdblfinder/environment.yml diff --git a/modules/local/scdblfinder/main.nf b/modules/local/doublet_detection/scdblfinder/main.nf similarity index 100% rename from modules/local/scdblfinder/main.nf rename to modules/local/doublet_detection/scdblfinder/main.nf diff --git a/modules/local/scdblfinder/meta.yml b/modules/local/doublet_detection/scdblfinder/meta.yml similarity index 100% rename from modules/local/scdblfinder/meta.yml rename to modules/local/doublet_detection/scdblfinder/meta.yml diff --git a/modules/local/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R similarity index 100% rename from modules/local/scdblfinder/templates/scdblfinder.R rename to modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R diff --git a/modules/local/scdblfinder/tests/main.nf.test b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test similarity index 96% rename from modules/local/scdblfinder/tests/main.nf.test rename to modules/local/doublet_detection/scdblfinder/tests/main.nf.test index 1ed6f7ec..0ed11140 100644 --- a/modules/local/scdblfinder/tests/main.nf.test +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test @@ -1,7 +1,7 @@ nextflow_process { name "Test Process SCDBLFINDER" - script "modules/local/scdblfinder/main.nf" + script "modules/local/doublet_detection/scdblfinder/main.nf" process "SCDBLFINDER" tag "modules" diff --git a/modules/local/scdblfinder/tests/main.nf.test.snap b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap similarity index 100% rename from modules/local/scdblfinder/tests/main.nf.test.snap rename to modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap diff --git 
a/subworkflows/local/doublet_detection/main.nf b/subworkflows/local/doublet_detection/main.nf index 110a3c4a..cc195cfc 100644 --- a/subworkflows/local/doublet_detection/main.nf +++ b/subworkflows/local/doublet_detection/main.nf @@ -2,7 +2,7 @@ include { SCVITOOLS_SOLO } from '../../../modules/nf-core/scvitools/solo' include { SCANPY_SCRUBLET } from '../../../modules/nf-core/scanpy/scrublet' include { DOUBLETDETECTION } from '../../../modules/nf-core/doubletdetection' include { SCDS } from '../../../modules/local/doublet_detection/scds' -include { SCDBLFINDER } from '../../../modules/local/scdblfinder' +include { SCDBLFINDER } from '../../../modules/local/doublet_detection/scdblfinder' include { DOUBLET_REMOVAL } from '../../../modules/local/doublet_detection/doublet_removal' workflow DOUBLET_DETECTION { From 993a8f2f5f88a0d92482494c587a9def5680fa80 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Thu, 12 Mar 2026 15:33:52 +0300 Subject: [PATCH 10/28] updated docs ouput to include scdblfinder --- docs/output.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/output.md b/docs/output.md index 04ffcff4..e08397b8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -25,6 +25,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html) - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html) - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html) + - [scDblFinder](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html) 2. Sample aggregation 1. Merge into a single h5ad file 2. 
Present QC for merged counts ([`MultiQC`](http://multiqc.info/)) From a56e65693974fa9d005ba7e5d2900856ebca6e1f Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Thu, 12 Mar 2026 13:02:17 +0000 Subject: [PATCH 11/28] [automated] Fix code linting --- CITATIONS.md | 2 +- modules/local/doublet_detection/scdblfinder/environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 5ce4742e..10ad4fc5 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -50,7 +50,7 @@ - [scDblFinder](https://pubmed.ncbi.nlm.nih.gov/35118618/) > Germain P, Lun A, Garcia Meixide C, Macnair W, Robinson M. Doublet identification in single-cell sequencing data using scDblFinder. F1000Res. 2022;11:979. doi: 10.12688/f1000research.73600.2. - + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/modules/local/doublet_detection/scdblfinder/environment.yml b/modules/local/doublet_detection/scdblfinder/environment.yml index ea37ba79..b3c8a625 100644 --- a/modules/local/doublet_detection/scdblfinder/environment.yml +++ b/modules/local/doublet_detection/scdblfinder/environment.yml @@ -8,4 +8,4 @@ dependencies: - bioconda::bioconductor-biocparallel=1.44.0 - bioconda::bioconductor-anndatar=1.0.2 - bioconda::bioconductor-rhdf5=2.54.1 - - conda-forge::r-tidyverse=2.0.0 \ No newline at end of file + - conda-forge::r-tidyverse=2.0.0 From ec56f35da159f673149357714169f760bdb84224 Mon Sep 17 00:00:00 2001 From: Kurayi Chawatama <142725139+KurayiChawatama@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:09:58 +0300 Subject: [PATCH 12/28] Update modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../doublet_detection/scdblfinder/templates/scdblfinder.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R 
b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 48e3b1a2..d7ca894a 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -10,7 +10,11 @@ adata <- read_h5ad("${h5ad}") sce <- adata\$as_SingleCellExperiment() # Set the param to a specified RNG seed for reproducibility -bp <- MulticoreParam(workers = multicoreWorkers(), RNGseed=123) +nxf_task_cpus <- as.integer(Sys.getenv("NXF_TASK_CPUS", unset = "1")) +if (is.na(nxf_task_cpus) || nxf_task_cpus < 1L) { + nxf_task_cpus <- 1L +} +bp <- MulticoreParam(workers = nxf_task_cpus, RNGseed=123) # 10 Genomics Doublet Rate calculator used to get multiplet rate if not provided From 0b0b19d70625b50dfd2211002f72469c45e5f06b Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 11:17:42 +0300 Subject: [PATCH 13/28] added https version of the singularity container link --- modules/local/doublet_detection/scdblfinder/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/doublet_detection/scdblfinder/main.nf b/modules/local/doublet_detection/scdblfinder/main.nf index 415593ed..8c9e691a 100644 --- a/modules/local/doublet_detection/scdblfinder/main.nf +++ b/modules/local/doublet_detection/scdblfinder/main.nf @@ -4,7 +4,7 @@ process SCDBLFINDER { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:28444625ead66428' : + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/99/993a012a69d920412b090701eb733ccf35c8655c3d012756ca6b0af1cfcd4780/data' : 'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:0f9db6b0855861de' }" input: From 6d741bc065e653548086844c0a5256c109f9c846 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 11:32:19 +0300 Subject: [PATCH 14/28] refactor(scDblFinder): optimize multiplet rate calculation using findInterval --- .../scdblfinder/templates/scdblfinder.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index d7ca894a..b3ade99d 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -29,12 +29,11 @@ multiplet_rates_10x <- data.frame( ) # Adjust to use the number of cells in the SCE object -multiplet_rate <- multiplet_rates_10x %>% - dplyr::filter(Recovered_cells < ncol(sce)) %>% - dplyr::slice(which.max(Recovered_cells)) %>% - dplyr::pull(Multiplet_rate) %>% - as.numeric() +idx <- findInterval(ncol(sce), multiplet_rates_10x\$Recovered_cells) +if (idx < 1L) idx <- 1L +if (idx > nrow(multiplet_rates_10x)) idx <- nrow(multiplet_rates_10x) +multiplet_rate <- as.numeric(multiplet_rates_10x\$Multiplet_rate[idx]) message(paste0("Setting multiplet rate to ", multiplet_rate, " for ", ncol(sce), " cells")) # Save original cell names and count before overwriting sce From 947ffa2c293ae182da91b342b15f19f905f4a339 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 11:46:18 +0300 Subject: [PATCH 15/28] added explanation 
for column name change --- .../scdblfinder/templates/scdblfinder.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index b3ade99d..f70f4637 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -10,12 +10,8 @@ adata <- read_h5ad("${h5ad}") sce <- adata\$as_SingleCellExperiment() # Set the param to a specified RNG seed for reproducibility -nxf_task_cpus <- as.integer(Sys.getenv("NXF_TASK_CPUS", unset = "1")) -if (is.na(nxf_task_cpus) || nxf_task_cpus < 1L) { - nxf_task_cpus <- 1L -} -bp <- MulticoreParam(workers = nxf_task_cpus, RNGseed=123) - +num_threads <- max(1L, as.integer("${task.cpus}")) +bp <- MulticoreParam(workers = num_threads, RNGseed = 123) # 10 Genomics Doublet Rate calculator used to get multiplet rate if not provided # 10X multiplet rate table(https://rpubs.com/kenneditodd/doublet_finder_example) @@ -50,7 +46,10 @@ sce <- scDblFinder( artificialDoublets = n_cells ) -# Restore original cell names +# Restore the input barcodes because running scDblFinder on the just the assay matrix above can +# return a new SCE whose column names no longer match the original AnnData cell IDs. +# Keeping the original names is required so the output h5ad obs_names and CSV rows +# still map back to the same cells seen by downstream steps. 
if (!is.null(original_cell_names) && length(original_cell_names) == ncol(sce)) { colnames(sce) <- original_cell_names } From 8c2e3827296073d02c6ae1ecfc236d8efb026b17 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 11:52:25 +0300 Subject: [PATCH 16/28] write updated SingleCellExperiment directly as h5ad without explicit conversion --- .../doublet_detection/scdblfinder/templates/scdblfinder.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index f70f4637..a11a0b2c 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -73,9 +73,8 @@ for (col in scdbl_cols) { colData(sce)[[col]] <- NULL } -# Convert back to AnnData and save -adata_processed <- as_AnnData(sce) -write_h5ad(adata_processed, "${prefix}.h5ad") +# Write the updated SingleCellExperiment directly as h5ad +write_h5ad(sce, "${prefix}.h5ad") # Extract predictions for doublet removal step # Create a binary doublet call based on class From 212b3e2bc2b675406bb12628dac6a9a9b6879523 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 12:16:58 +0300 Subject: [PATCH 17/28] enhance h5ad writing with validation for cell barcodes and primary assay --- .../scdblfinder/templates/scdblfinder.R | 20 +++++++++++++------ .../scdblfinder/tests/main.nf.test | 9 ++++----- .../scdblfinder/tests/main.nf.test.snap | 6 ++---- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index a11a0b2c..6bc3540e 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -73,15 +73,23 @@ for (col in 
scdbl_cols) { colData(sce)[[col]] <- NULL } -# Write the updated SingleCellExperiment directly as h5ad -write_h5ad(sce, "${prefix}.h5ad") +# The doublet calls must stay keyed by the original cell barcodes. If they are not +# present here, something went wrong during conversion or scDblFinder processing and +# we should fail instead of inventing replacement identifiers. +if (is.null(colnames(sce)) || length(colnames(sce)) != ncol(sce)) { + stop("scDblFinder output is missing valid cell barcodes; cannot write aligned h5ad and prediction outputs.") +} + +# Write the updated SingleCellExperiment directly as h5ad, explicitly mapping the +# primary assay to AnnData X so downstream readers see a valid matrix field. +primary_assay <- assayNames(sce)[1] +if (is.na(primary_assay) || primary_assay == "") { + stop("scDblFinder output is missing a primary assay; cannot write h5ad output.") +} +write_h5ad(sce, "${prefix}.h5ad", x_mapping = primary_assay) # Extract predictions for doublet removal step # Create a binary doublet call based on class -# Ensure we have valid row names -if (is.null(colnames(sce)) || length(colnames(sce)) != ncol(sce)) { - colnames(sce) <- paste0("cell_", seq_len(ncol(sce))) -} # Create predictions vector doublet_calls <- colData(sce)\$scdblfinder_class == "doublet" diff --git a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test index 0ed11140..8172a071 100644 --- a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test @@ -29,12 +29,11 @@ nextflow_process { { assert process.success }, { assert snapshot( process.out.versions, - process.out.predictions, - // Hashing does not work due to this issue: - // https://github.com/scverse/anndataR/issues/272 - file(process.out.h5ad.get(0).get(1)).exists(), - file(process.out.h5ad.get(0).get(1)).size() + process.out.predictions ).match() } + , + { assert 
file(process.out.h5ad.get(0).get(1)).exists() }, + { assert file(process.out.h5ad.get(0).get(1)).size() > 0 } ) } diff --git a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap index 721b5284..ba49cf65 100644 --- a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap @@ -109,11 +109,9 @@ }, "test_scdblfinder.csv:md5,130130ae215768e16e0df93a064dc5e9" ] - ], - true, - 5101352 + ] ], - "timestamp": "2026-03-12T13:08:16.847676966", + "timestamp": "2026-03-13T12:03:27.014939887", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" From 9774828800af69041963b7a41d0f705d38b20aa6 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 12:18:37 +0300 Subject: [PATCH 18/28] add scdblfinder to input methods in doublet detection subworkflow test --- subworkflows/local/doublet_detection/tests/main.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/doublet_detection/tests/main.nf.test b/subworkflows/local/doublet_detection/tests/main.nf.test index 26fb4f2b..b8f0ef18 100644 --- a/subworkflows/local/doublet_detection/tests/main.nf.test +++ b/subworkflows/local/doublet_detection/tests/main.nf.test @@ -50,7 +50,7 @@ nextflow_workflow { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) ] ) - input[1] = ['scds', 'solo', 'scrublet'] + input[1] = ['scds', 'solo', 'scrublet', 'scdblfinder'] input[2] = 2 input[3] = 1 """ From e32e393094310d743ba53dbdd6cfadb76ad69740 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 12:27:18 +0300 Subject: [PATCH 19/28] streamline renaming of scDblFinder columns with less clumsy code --- .../scdblfinder/templates/scdblfinder.R | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git 
a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 6bc3540e..3e2dfd84 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -58,20 +58,14 @@ if (!is.null(original_cell_names) && length(original_cell_names) == ncol(sce)) { message("scDblFinder results summary:") print(table(sce\$scDblFinder.class)) -# Rename scDblFinder.* columns for consistency with other doublet methods -scdbl_cols <- grep("^scDblFinder\\\\.", colnames(colData(sce)), value = TRUE) - -# First remove "scDblFinder." prefix, THEN replace remaining dots with underscores -new_scdbl_cols <- paste0("scdblfinder_", gsub("\\\\.", "_", gsub("^scDblFinder\\\\.", "", scdbl_cols))) - -# Rename columns in colData(sce) - create new columns first, then delete old ones -for (i in seq_along(scdbl_cols)) { - colData(sce)[[new_scdbl_cols[i]]] <- colData(sce)[[scdbl_cols[i]]] -} -# Now delete old columns -for (col in scdbl_cols) { - colData(sce)[[col]] <- NULL -} +# Rename scDblFinder.* columns for consistency with other doublet methods. +# Replace prefix first, then replace any remaining dots with underscores. +idx <- grep("^scDblFinder\\\\.", colnames(colData(sce))) +colnames(colData(sce))[idx] <- gsub( + "\\\\.", + "_", + sub("^scDblFinder\\\\.", "scdblfinder_", colnames(colData(sce))[idx]) +) # The doublet calls must stay keyed by the original cell barcodes. 
If they are not # present here, something went wrong during conversion or scDblFinder processing and From addb64126710bd2d422d8268030a03c6746e860e Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 12:36:19 +0300 Subject: [PATCH 20/28] removed explicit call of artifical doublet number in scdblfinder function --- .../doublet_detection/scdblfinder/templates/scdblfinder.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 3e2dfd84..0e4dc658 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -42,8 +42,7 @@ set.seed(123) sce <- scDblFinder( assays(sce)[[1]], BPPARAM = bp, - dbr = multiplet_rate, - artificialDoublets = n_cells + dbr = multiplet_rate ) # Restore the input barcodes because running scDblFinder on the just the assay matrix above can From 1ad461fb275c2608fa00af1f821880e29fc5c9d8 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 12:36:48 +0300 Subject: [PATCH 21/28] updated test snapshot to match previous commit --- .../doublet_detection/scdblfinder/tests/main.nf.test.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap index ba49cf65..2c1f2b89 100644 --- a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap @@ -107,11 +107,11 @@ { "id": "test" }, - "test_scdblfinder.csv:md5,130130ae215768e16e0df93a064dc5e9" + "test_scdblfinder.csv:md5,26628dd50c32c06df8fd1ffb973c9e3d" ] ] ], - "timestamp": "2026-03-13T12:03:27.014939887", + "timestamp": "2026-03-13T12:35:33.96981645", "meta": { "nf-test": "0.9.4", "nextflow": 
"25.10.4" From 63c0307fc41bc9f01a80246ccd4eba123e102a71 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 13:15:28 +0300 Subject: [PATCH 22/28] Enhance scDblFinder functionality and documentation - Added optional `doublet_rate` column in input samplesheet for per-sample expected doublet rate in `scDblFinder`. - Updated `scDblFinder` to utilize internal `dbr` estimation when `doublet_rate` is not provided. - Modified input and output handling in `SCDBLFINDER` process to accommodate new `doublet_rate` parameter. - Updated relevant documentation including CHANGELOG, README, and usage examples to reflect changes. - Added tests to validate functionality with provided `doublet_rate`. --- CHANGELOG.md | 3 + README.md | 2 + assets/schema_input.json | 7 +++ docs/usage.md | 9 +-- .../doublet_detection/scdblfinder/main.nf | 2 +- .../doublet_detection/scdblfinder/meta.yml | 5 ++ .../scdblfinder/templates/scdblfinder.R | 40 +++++------- .../scdblfinder/tests/main.nf.test | 38 ++++++++++- .../scdblfinder/tests/main.nf.test.snap | 63 +++++-------------- ro-crate-metadata.json | 2 +- subworkflows/local/doublet_detection/main.nf | 3 +- 11 files changed, 96 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 888d9ccd..a5b22f70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,12 @@ Initial release of nf-core/scdownstream, created with the [nf-core](https://nf-c - Added `singleR` module for automated cell type annotation. - Added `scDblFinder` module for doublet detection. +- Added optional `doublet_rate` column in input samplesheet to provide per-sample expected doublet rate for `scDblFinder`. ### `Fixed` +- Updated `scDblFinder` to use internal `dbr` estimation when `doublet_rate` is not provided, and to use provided `doublet_rate` when available. 
+ ### `Dependencies` ### `Deprecated` diff --git a/README.md b/README.md index 6edbd11e..5d987582 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,8 @@ sample4,/absolute/path/to/sample3.csv Each entry represents a h5ad, h5, RDS or CSV file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function. CSV files should contain a matrix with genes as columns and cells as rows. The first column should contain cell names/barcodes. +For `scDblFinder`, you can optionally add a `doublet_rate` column (values between `0` and `1`) to the samplesheet. If omitted, `scDblFinder` estimates the doublet rate internally. + --> Now, you can run the pipeline using: diff --git a/assets/schema_input.json b/assets/schema_input.json index dedad5d7..d2636e85 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -122,6 +122,13 @@ "errorMessage": "Number of cells expected from the experimental design, used as input to cellbender.", "meta": ["expected_cells"] }, + "doublet_rate": { + "type": "number", + "minimum": 0, + "maximum": 1, + "errorMessage": "doublet_rate must be a number between 0 and 1.", + "meta": ["doublet_rate"] + }, "ambient_correction": { "type": "boolean", "default": true, diff --git a/docs/usage.md b/docs/usage.md index f635963e..8273218e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -38,10 +38,10 @@ sample3,/absolute/path/to/sample3.csv There are a couple of optional columns that can be used for more advanced features: ```csv title="samplesheet.csv" -sample,filtered,unfiltered,batch_col,label_col,condition_col,unknown_label,min_genes,min_cells,min_counts_cell,min_counts_gene,expected_cells,ambient_correction,ambient_corrected_integration -sample1,/absolute/path/to/sample1_filtered.h5ad,/absolute/path/to/sample1.h5ad,batch,cell_type,condition,unknown,1,2,3,4,5000,true,false 
-sample2,relative/path/to/sample2_filtered.rds,relative/path/to/sample2.rds,batch_id,annotation,condition,unannotated,5,6,7,8,3000,false, -sample3,/absolute/path/to/sample3_filtered.csv,/absolute/path/to/sample3.csv,,,,,9,10,11,12,,true,true +sample,filtered,unfiltered,batch_col,label_col,condition_col,unknown_label,min_genes,min_cells,min_counts_cell,min_counts_gene,expected_cells,doublet_rate,ambient_correction,ambient_corrected_integration +sample1,/absolute/path/to/sample1_filtered.h5ad,/absolute/path/to/sample1.h5ad,batch,cell_type,condition,unknown,1,2,3,4,5000,0.08,true,false +sample2,relative/path/to/sample2_filtered.rds,relative/path/to/sample2.rds,batch_id,annotation,condition,unannotated,5,6,7,8,3000,,false, +sample3,/absolute/path/to/sample3_filtered.csv,/absolute/path/to/sample3.csv,,,,,9,10,11,12,,,true,true ``` For CSV input files, specifying the `batch_col`, `label_col`, `condition_col`, and `unknown_label` columns will not have any effect, as no additional metadata is available in the CSV file. @@ -63,6 +63,7 @@ For CSV input files, specifying the `batch_col`, `label_col`, `condition_col`, a | `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. | | `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | | `expected_cells` | Number of expected cells, used as input to CellBender for empty droplet detection. | +| `doublet_rate` | Optional expected doublet rate (0-1) for `scDblFinder`. If not provided, `scDblFinder` estimates it internally. | | `max_mito_percentage` | Maximum percentage of mitochondrial reads for a cell to be considered. Defaults to `100`. | | `ambient_correction` | Whether to perform ambient RNA correction for this sample. Set to `true` to use the globally configured method, `false` to skip ambient correction for this sample. Defaults to `true`. 
| | `ambient_corrected_integration` | Whether to use ambient-corrected counts for integration for this sample. Set to `true` to use corrected counts in downstream integration, `false` to store them only as additional layers. Can override the global `--ambient_corrected_integration` parameter. Defaults to global setting. | diff --git a/modules/local/doublet_detection/scdblfinder/main.nf b/modules/local/doublet_detection/scdblfinder/main.nf index 8c9e691a..2f643d15 100644 --- a/modules/local/doublet_detection/scdblfinder/main.nf +++ b/modules/local/doublet_detection/scdblfinder/main.nf @@ -8,7 +8,7 @@ process SCDBLFINDER { 'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:0f9db6b0855861de' }" input: - tuple val(meta), path(h5ad) + tuple val(meta), path(h5ad), val(dbr) output: tuple val(meta), path("${prefix}.h5ad"), emit: h5ad diff --git a/modules/local/doublet_detection/scdblfinder/meta.yml b/modules/local/doublet_detection/scdblfinder/meta.yml index 33810388..566d28ec 100644 --- a/modules/local/doublet_detection/scdblfinder/meta.yml +++ b/modules/local/doublet_detection/scdblfinder/meta.yml @@ -27,6 +27,11 @@ input: pattern: "*.{h5ad}" ontologies: - edam: "http://edamontology.org/format_3590" # HDF5 format + - dbr: + type: number + description: | + Optional expected doublet rate (0-1). If null, scDblFinder estimates + the doublet rate internally. 
output: h5ad: diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 0e4dc658..21d526dd 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -13,37 +13,31 @@ sce <- adata\$as_SingleCellExperiment() num_threads <- max(1L, as.integer("${task.cpus}")) bp <- MulticoreParam(workers = num_threads, RNGseed = 123) -# 10 Genomics Doublet Rate calculator used to get multiplet rate if not provided -# 10X multiplet rate table(https://rpubs.com/kenneditodd/doublet_finder_example) -multiplet_rates_10x <- data.frame( - "Multiplet_rate" = c(0.004, 0.008, 0.0160, 0.023, 0.031, - 0.039, 0.046, 0.054, 0.061, 0.069, 0.076), - "Loaded_cells" = c(800, 1600, 3200, 4800, 6400, 8000, 9600, - 11200, 12800, 14400, 16000), - "Recovered_cells" = c(500, 1000, 2000, 3000, 4000, 5000, 6000, - 7000, 8000, 9000, 10000) -) - -# Adjust to use the number of cells in the SCE object -idx <- findInterval(ncol(sce), multiplet_rates_10x\$Recovered_cells) -if (idx < 1L) idx <- 1L -if (idx > nrow(multiplet_rates_10x)) idx <- nrow(multiplet_rates_10x) - -multiplet_rate <- as.numeric(multiplet_rates_10x\$Multiplet_rate[idx]) -message(paste0("Setting multiplet rate to ", multiplet_rate, " for ", ncol(sce), " cells")) - # Save original cell names and count before overwriting sce original_cell_names <- colnames(sce) -n_cells <- ncol(sce) + +# Parse per-sample doublet rate from Nextflow input. If unavailable, let +# scDblFinder estimate dbr internally (recommended default for 10X data). 
+dbr_raw <- trimws("${dbr}") +dbr <- suppressWarnings(as.numeric(dbr_raw)) # Run scDblFinder on the counts matrix (first assay) # scDblFinder creates artificial doublets internally and returns a new SCE set.seed(123) -sce <- scDblFinder( +if (!is.na(dbr)) { + message(paste0("Using provided doublet_rate (dbr): ", dbr)) + sce <- scDblFinder( assays(sce)[[1]], BPPARAM = bp, - dbr = multiplet_rate -) + dbr = dbr + ) +} else { + message("No valid doublet_rate provided; using scDblFinder internal dbr estimation") + sce <- scDblFinder( + assays(sce)[[1]], + BPPARAM = bp + ) +} # Restore the input barcodes because running scDblFinder on the just the assay matrix above can # return a new SCE whose column names no longer match the original AnnData cell IDs. diff --git a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test index 8172a071..b0831c94 100644 --- a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test @@ -17,7 +17,8 @@ nextflow_process { """ input[0] = channel.of([ [ id: 'test' ], - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true), + null ] ) """ @@ -39,6 +40,38 @@ nextflow_process { } + test("Should run with provided doublet_rate") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true), + 0.08 + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.predictions + ).match() }, + { assert 
file(process.out.h5ad.get(0).get(1)).exists() }, + { assert file(process.out.h5ad.get(0).get(1)).size() > 0 } + ) + } + + } + test("Should run without failures - stub") { options '-stub' @@ -51,7 +84,8 @@ nextflow_process { """ input[0] = channel.of([ [ id: 'test' ], - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true), + null ] ) """ diff --git a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap index 2c1f2b89..dda2e33e 100644 --- a/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap +++ b/modules/local/doublet_detection/scdblfinder/tests/main.nf.test.snap @@ -1,5 +1,5 @@ { - "homo_sapiens - h5ad - stub": { + "Should run without failures - stub": { "content": [ { "0": [ @@ -42,62 +42,33 @@ ] } ], - "timestamp": "2026-03-12T11:44:51.894263326", + "timestamp": "2026-03-12T12:34:23.397301125", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" } }, - "Should run without failures - stub": { + "Should run without failures": { "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - "h5ad": [ - [ - { - "id": "test" - }, - "test_scdblfinder.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "predictions": [ - [ - { - "id": "test" - }, - "test_scdblfinder.csv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + "versions.yml:md5,ce056c78586769ad5433f5fbb86f70c7" + ], + [ + [ + { + "id": "test" + }, + 
"test_scdblfinder.csv:md5,e92cd0219440b0caab1afb4b5b7f3e60" ] - } + ] ], - "timestamp": "2026-03-12T12:34:23.397301125", + "timestamp": "2026-03-13T12:53:40.597040086", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" } }, - "Should run without failures": { + "Should run with provided doublet_rate": { "content": [ [ "versions.yml:md5,ce056c78586769ad5433f5fbb86f70c7" @@ -107,11 +78,11 @@ { "id": "test" }, - "test_scdblfinder.csv:md5,26628dd50c32c06df8fd1ffb973c9e3d" + "test_scdblfinder.csv:md5,ad5d0bf6045f81b6a04980d1f522420e" ] ] ], - "timestamp": "2026-03-13T12:35:33.96981645", + "timestamp": "2026-03-13T13:02:31.168458271", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 07662401..d1e47b0c 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:29+00:00", - "description": "

\n \n \n \"nf-core/scdownstream\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/scdownstream)\n[![GitHub Actions CI Status](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scdownstream/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/scdownstream)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23scdownstream-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/scdownstream)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/scdownstream** is a bioinformatics pipeline that can be used to process already quantified single-cell RNA-seq data. It takes a samplesheet and h5ad-, SingleCellExperiment/Seurat- or CSV files as input and performs quality control, integration, dimensionality reduction and clustering. It produces an integrated h5ad and SingleCellExperiment file and an extensive QC report.\n\nThe pipeline is based on the learnings and implementations from the following pipelines (alphabetical):\n\n- [panpipes](https://github.com/DendrouLab/panpipes)\n- [scFlow](https://combiz.github.io/scFlow/)\n- [scRAFIKI](https://github.com/Mye-InfoBank/scRAFIKI)\n- [YASCP](https://github.com/wtsi-hgi/yascp)\n\n# ![nf-core/scdownstream](docs/images/metromap.png)\n\nSteps marked with the boat icon are not yet implemented. For the other steps, the pipeline uses the following tools:\n\n1. Per-sample preprocessing\n 1. Convert all RDS files to h5ad format\n 2. Create filtered matrix (if not provided)\n 3. Present QC for raw counts ([`MultiQC`](http://multiqc.info/))\n 4. Remove ambient RNA\n - [decontX](https://bioconductor.org/packages/release/bioc/html/decontX.html)\n - [soupX](https://cran.r-project.org/web/packages/SoupX/readme/README.html)\n - [cellbender](https://cellbender.readthedocs.io/en/latest/)\n - [scAR](https://docs.scvi-tools.org/en/stable/user_guide/models/scar.html)\n 5. 
Apply user-defined QC filters (can be defined per sample in the samplesheet)\n 6. Doublet detection (Majority vote possible)\n - [SOLO](https://docs.scvi-tools.org/en/stable/user_guide/models/solo.html)\n - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html)\n - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html)\n - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html)\n - [scDblFinder](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html)\n2. Sample aggregation\n 1. Merge into a single h5ad file\n 2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/))\n 3. Integration\n - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html)\n - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html)\n - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html)\n - [BBKNN](https://github.com/Teichlab/bbknn)\n - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)\n - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)\n3. Cell type annotation\n - [celltypist](https://www.celltypist.org/)\n4. Clustering and dimensionality reduction\n 1. [Leiden clustering](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.leiden.html)\n 2. [UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)\n5. Create report ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n> [!NOTE]\n> If you are confused by the terms `filtered` and `unfiltered`, please check out the respective [documentation](https://nf-co.re/scdownstream/dev/docs/usage/#filtered-and-unfiltered-matrices).\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,unfiltered\nsample1,/absolute/path/to/sample1.h5ad\nsample2,/absolute/path/to/sample3.h5\nsample3,relative/path/to/sample2.rds\nsample4,/absolute/path/to/sample3.csv\n```\n\nEach entry represents a h5ad, h5, RDS or CSV file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function.\nCSV files should contain a matrix with genes as columns and cells as rows. The first column should contain cell names/barcodes.\n\n-->\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/scdownstream \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scdownstream/usage) and the [parameter documentation](https://nf-co.re/scdownstream/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scdownstream/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/scdownstream/output).\n\n## Credits\n\nnf-core/scdownstream was originally written by [Nico Trummer](https://github.com/nictru).\n\nWe thank the following people for their extensive assistance in the development of this pipeline (alphabetical):\n\n- [Fabian Rost](https://github.com/fbnrst)\n- [Fabiola Curion](https://github.com/bio-la)\n- [Gregor Sturm](https://github.com/grst)\n- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)\n- [Lukas Heumos](https://github.com/zethson)\n- [Matiss Ozols](https://github.com/maxozo)\n- [Nathan Skene](https://github.com/NathanSkene)\n- [Nurun Fancy](https://github.com/nfancy)\n- [Riley Grindle](https://github.com/Riley-Grindle)\n- [Ryan Seaman](https://github.com/RPSeaman)\n- [Steffen M\u00f6ller](https://github.com/smoe)\n- [Wojtek Sowinski](https://github.com/WojtekSowinski)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#scdownstream` channel](https://nfcore.slack.com/channels/scdownstream) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive 
list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/scdownstream\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/scdownstream)\n[![GitHub Actions CI Status](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/scdownstream/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scdownstream/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/scdownstream)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23scdownstream-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/scdownstream)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/scdownstream** is a bioinformatics pipeline that can be used to process already quantified single-cell RNA-seq data. It takes a samplesheet and h5ad-, SingleCellExperiment/Seurat- or CSV files as input and performs quality control, integration, dimensionality reduction and clustering. It produces an integrated h5ad and SingleCellExperiment file and an extensive QC report.\n\nThe pipeline is based on the learnings and implementations from the following pipelines (alphabetical):\n\n- [panpipes](https://github.com/DendrouLab/panpipes)\n- [scFlow](https://combiz.github.io/scFlow/)\n- [scRAFIKI](https://github.com/Mye-InfoBank/scRAFIKI)\n- [YASCP](https://github.com/wtsi-hgi/yascp)\n\n# ![nf-core/scdownstream](docs/images/metromap.png)\n\nSteps marked with the boat icon are not yet implemented. For the other steps, the pipeline uses the following tools:\n\n1. Per-sample preprocessing\n 1. Convert all RDS files to h5ad format\n 2. Create filtered matrix (if not provided)\n 3. Present QC for raw counts ([`MultiQC`](http://multiqc.info/))\n 4. Remove ambient RNA\n - [decontX](https://bioconductor.org/packages/release/bioc/html/decontX.html)\n - [soupX](https://cran.r-project.org/web/packages/SoupX/readme/README.html)\n - [cellbender](https://cellbender.readthedocs.io/en/latest/)\n - [scAR](https://docs.scvi-tools.org/en/stable/user_guide/models/scar.html)\n 5. 
Apply user-defined QC filters (can be defined per sample in the samplesheet)\n 6. Doublet detection (Majority vote possible)\n - [SOLO](https://docs.scvi-tools.org/en/stable/user_guide/models/solo.html)\n - [scrublet](https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.scrublet.html)\n - [DoubletDetection](https://doubletdetection.readthedocs.io/en/v2.5.2/doubletdetection.doubletdetection.html)\n - [SCDS](https://bioconductor.org/packages/devel/bioc/vignettes/scds/inst/doc/scds.html)\n - [scDblFinder](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html)\n2. Sample aggregation\n 1. Merge into a single h5ad file\n 2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/))\n 3. Integration\n - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html)\n - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html)\n - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html)\n - [BBKNN](https://github.com/Teichlab/bbknn)\n - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)\n - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)\n3. Cell type annotation\n - [celltypist](https://www.celltypist.org/)\n4. Clustering and dimensionality reduction\n 1. [Leiden clustering](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.leiden.html)\n 2. [UMAP](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.umap.html)\n5. Create report ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n> [!NOTE]\n> If you are confused by the terms `filtered` and `unfiltered`, please check out the respective [documentation](https://nf-co.re/scdownstream/dev/docs/usage/#filtered-and-unfiltered-matrices).\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,unfiltered\nsample1,/absolute/path/to/sample1.h5ad\nsample2,/absolute/path/to/sample3.h5\nsample3,relative/path/to/sample2.rds\nsample4,/absolute/path/to/sample3.csv\n```\n\nEach entry represents a h5ad, h5, RDS or CSV file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function.\nCSV files should contain a matrix with genes as columns and cells as rows. The first column should contain cell names/barcodes.\n\nFor `scDblFinder`, you can optionally add a `doublet_rate` column (values between `0` and `1`) to the samplesheet. If omitted, `scDblFinder` estimates the doublet rate internally.\n\n-->\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/scdownstream \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scdownstream/usage) and the [parameter documentation](https://nf-co.re/scdownstream/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scdownstream/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/scdownstream/output).\n\n## Credits\n\nnf-core/scdownstream was originally written by [Nico Trummer](https://github.com/nictru).\n\nWe thank the following people for their extensive assistance in the development of this pipeline (alphabetical):\n\n- [Fabian Rost](https://github.com/fbnrst)\n- [Fabiola Curion](https://github.com/bio-la)\n- [Gregor Sturm](https://github.com/grst)\n- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)\n- [Lukas Heumos](https://github.com/zethson)\n- [Matiss Ozols](https://github.com/maxozo)\n- [Nathan Skene](https://github.com/NathanSkene)\n- [Nurun Fancy](https://github.com/nfancy)\n- [Riley Grindle](https://github.com/Riley-Grindle)\n- [Ryan Seaman](https://github.com/RPSeaman)\n- [Steffen M\u00f6ller](https://github.com/smoe)\n- [Wojtek Sowinski](https://github.com/WojtekSowinski)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#scdownstream` channel](https://nfcore.slack.com/channels/scdownstream) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive 
list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/subworkflows/local/doublet_detection/main.nf b/subworkflows/local/doublet_detection/main.nf index cc195cfc..e75056ad 100644 --- a/subworkflows/local/doublet_detection/main.nf +++ b/subworkflows/local/doublet_detection/main.nf @@ -21,6 +21,7 @@ workflow DOUBLET_DETECTION { log.info("DOUBLET_DETECTION: Not performed since no methods selected.") } else { ch_batch_col = ch_h5ad.map { meta, _h5ad -> meta.batch_col } + ch_h5ad_doublet_rate = ch_h5ad.map { meta, h5ad -> [meta, h5ad, meta.doublet_rate] } if (methods.contains('scds')) { SCDS ( @@ -59,7 +60,7 @@ workflow DOUBLET_DETECTION { if (methods.contains('scdblfinder')) { SCDBLFINDER ( - ch_h5ad + ch_h5ad_doublet_rate ) ch_predictions = ch_predictions.mix(SCDBLFINDER.out.predictions) ch_versions = ch_versions.mix(SCDBLFINDER.out.versions) From ee63725dbf8b07c3be8d4eefec3ab0819f4d653a Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Fri, 13 Mar 2026 10:32:41 +0000 Subject: [PATCH 23/28] [automated] Fix code linting --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 8273218e..d5dd5c52 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -63,7 +63,7 @@ For CSV input files, specifying the `batch_col`, `label_col`, `condition_col`, a | `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. 
| | `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | | `expected_cells` | Number of expected cells, used as input to CellBender for empty droplet detection. | -| `doublet_rate` | Optional expected doublet rate (0-1) for `scDblFinder`. If not provided, `scDblFinder` estimates it internally. | +| `doublet_rate` | Optional expected doublet rate (0-1) for `scDblFinder`. If not provided, `scDblFinder` estimates it internally. | | `max_mito_percentage` | Maximum percentage of mitochondrial reads for a cell to be considered. Defaults to `100`. | | `ambient_correction` | Whether to perform ambient RNA correction for this sample. Set to `true` to use the globally configured method, `false` to skip ambient correction for this sample. Defaults to `true`. | | `ambient_corrected_integration` | Whether to use ambient-corrected counts for integration for this sample. Set to `true` to use corrected counts in downstream integration, `false` to store them only as additional layers. Can override the global `--ambient_corrected_integration` parameter. Defaults to global setting. 
| From 6480259e53f23c933e8bd8a14a86f707091632cf Mon Sep 17 00:00:00 2001 From: Kurayi Chawatama <142725139+KurayiChawatama@users.noreply.github.com> Date: Fri, 13 Mar 2026 13:46:08 +0300 Subject: [PATCH 24/28] change other doublet detection methods to use mix Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- subworkflows/local/doublet_detection/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/doublet_detection/main.nf b/subworkflows/local/doublet_detection/main.nf index e75056ad..81e26962 100644 --- a/subworkflows/local/doublet_detection/main.nf +++ b/subworkflows/local/doublet_detection/main.nf @@ -28,7 +28,7 @@ workflow DOUBLET_DETECTION { ch_h5ad ) ch_predictions = ch_predictions.mix(SCDS.out.predictions) - ch_versions = SCDS.out.versions + ch_versions = ch_versions.mix(SCDS.out.versions) } if (methods.contains('solo')) { @@ -38,7 +38,7 @@ workflow DOUBLET_DETECTION { scvi_max_epochs ?: [] ) ch_predictions = ch_predictions.mix(SCVITOOLS_SOLO.out.predictions) - ch_versions = SCVITOOLS_SOLO.out.versions + ch_versions = ch_versions.mix(SCVITOOLS_SOLO.out.versions) } if (methods.contains('scrublet')) { @@ -47,7 +47,7 @@ workflow DOUBLET_DETECTION { ch_batch_col ) ch_predictions = ch_predictions.mix(SCANPY_SCRUBLET.out.predictions) - ch_versions = SCANPY_SCRUBLET.out.versions + ch_versions = ch_versions.mix(SCANPY_SCRUBLET.out.versions) } if (methods.contains('doubletdetection')) { @@ -55,7 +55,7 @@ workflow DOUBLET_DETECTION { ch_h5ad ) ch_predictions = ch_predictions.mix(DOUBLETDETECTION.out.predictions) - ch_versions = DOUBLETDETECTION.out.versions + ch_versions = ch_versions.mix(DOUBLETDETECTION.out.versions) } if (methods.contains('scdblfinder')) { From 4b5b2b3fa1735bf7bf75042543626272519d610d Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 13:45:11 +0300 Subject: [PATCH 25/28] Remove redundant restoration of original cell barcodes in 
scDblFinder processing --- .../doublet_detection/scdblfinder/templates/scdblfinder.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 21d526dd..190743ee 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -39,14 +39,6 @@ if (!is.na(dbr)) { ) } -# Restore the input barcodes because running scDblFinder on the just the assay matrix above can -# return a new SCE whose column names no longer match the original AnnData cell IDs. -# Keeping the original names is required so the output h5ad obs_names and CSV rows -# still map back to the same cells seen by downstream steps. -if (!is.null(original_cell_names) && length(original_cell_names) == ncol(sce)) { - colnames(sce) <- original_cell_names -} - # Generate a summary table message("scDblFinder results summary:") print(table(sce\$scDblFinder.class)) From ee1d883846a6148eeddeb2d33ba86ae92e5cd06e Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 14:13:08 +0300 Subject: [PATCH 26/28] Remove unnecessary comment about RNG seed parameter in scDblFinder script --- .../local/doublet_detection/scdblfinder/templates/scdblfinder.R | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index 190743ee..d930fefd 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -9,7 +9,6 @@ library(anndataR) adata <- read_h5ad("${h5ad}") sce <- adata\$as_SingleCellExperiment() -# Set the param to a specified RNG seed for reproducibility num_threads <- max(1L, as.integer("${task.cpus}")) bp <- MulticoreParam(workers = num_threads, RNGseed = 123) 
From a2fe8ab88721a0f890ea8c6ceab95bfacb647d24 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 14:20:41 +0300 Subject: [PATCH 27/28] Refactor doublet rate handling in scDblFinder to streamline logic and improve readability --- .../scdblfinder/templates/scdblfinder.R | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R index d930fefd..50edc27c 100644 --- a/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R +++ b/modules/local/doublet_detection/scdblfinder/templates/scdblfinder.R @@ -23,21 +23,19 @@ dbr <- suppressWarnings(as.numeric(dbr_raw)) # Run scDblFinder on the counts matrix (first assay) # scDblFinder creates artificial doublets internally and returns a new SCE set.seed(123) -if (!is.na(dbr)) { - message(paste0("Using provided doublet_rate (dbr): ", dbr)) - sce <- scDblFinder( - assays(sce)[[1]], - BPPARAM = bp, - dbr = dbr - ) -} else { +if (is.na(dbr)) { message("No valid doublet_rate provided; using scDblFinder internal dbr estimation") - sce <- scDblFinder( - assays(sce)[[1]], - BPPARAM = bp - ) + dbr <- NULL +} else { + message(paste0("Using provided doublet_rate (dbr): ", dbr)) } +sce <- scDblFinder( + assays(sce)[[1]], + BPPARAM = bp, + dbr = dbr +) + # Generate a summary table message("scDblFinder results summary:") print(table(sce\$scDblFinder.class)) From 6194ca488b26b5ae7e13664cf63258e950e02a93 Mon Sep 17 00:00:00 2001 From: KurayiChawatama Date: Fri, 13 Mar 2026 14:32:30 +0300 Subject: [PATCH 28/28] Fix regex pattern for doublet detection tool options in nextflow_schema.json --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 2233d8a6..545f8f8d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -106,7 +106,7 @@ "default": "scrublet", 
"description": "Specify the tools to use for doublet detection. Setting to 'none' will skip this step", "help_text": "If you want to use multiple tools, separate them with a comma. Available methods are: solo, scrublet, doubletdetection, scds, scdblfinder", - "pattern": "^(none|((solo|scrublet|doubletdetection|scds|scdblfinder)?,?)*[^,]+$)" + "pattern": "^(none|(solo|scrublet|doubletdetection|scds|scdblfinder)(,(solo|scrublet|doubletdetection|scds|scdblfinder))*)$" }, "doublet_detection_threshold": { "type": "integer",