-
Notifications
You must be signed in to change notification settings - Fork 61
Module/scdblfinder #261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Module/scdblfinder #261
Changes from all commits
f949409
0c1c2ee
8ba85f7
5a109a5
0fb9d77
9af9d36
bd40dce
807f00f
275fac8
993a8f2
a56e656
ec56f35
632676f
7b03b01
0b0b19d
6d741bc
947ffa2
8c2e382
212b3e2
9774828
e32e393
addb641
1ad461f
63c0307
ee63725
6480259
4b5b2b3
ee1d883
a2fe8ab
6194ca4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| name: scdblfinder | ||
| channels: | ||
| - conda-forge | ||
| - bioconda | ||
| dependencies: | ||
| - bioconda::bioconductor-scdblfinder=1.24.0 | ||
| - bioconda::bioconductor-singlecellexperiment=1.32.0 | ||
| - bioconda::bioconductor-biocparallel=1.44.0 | ||
| - bioconda::bioconductor-anndatar=1.0.2 | ||
| - bioconda::bioconductor-rhdf5=2.54.1 | ||
| - conda-forge::r-tidyverse=2.0.0 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| // Detect doublets in one AnnData (.h5ad) file per sample with scDblFinder. | ||
| // | ||
| // Inputs : meta map, the .h5ad file, and an optional expected doublet rate (dbr). | ||
| // Outputs: the h5ad written by the R template, a per-cell CSV of doublet calls, | ||
| //          and versions.yml. | ||
| process SCDBLFINDER { | ||
|     tag "$meta.id" | ||
|     label 'process_medium' | ||
|
|
||
|     conda "${moduleDir}/environment.yml" | ||
|     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
|         'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/99/993a012a69d920412b090701eb733ccf35c8655c3d012756ca6b0af1cfcd4780/data' : | ||
|         'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-biocparallel_bioconductor-rhdf5_bioconductor-scdblfinder_pruned:0f9db6b0855861de' }" | ||
|
|
||
|     input: | ||
|     tuple val(meta), path(h5ad), val(dbr) | ||
|
|
||
|     output: | ||
|     tuple val(meta), path("${prefix}.h5ad"), emit: h5ad | ||
|     tuple val(meta), path("${prefix}.csv"), emit: predictions | ||
|     path "versions.yml", emit: versions | ||
|
|
||
|     when: | ||
|     task.ext.when == null || task.ext.when | ||
|
|
||
|     script: | ||
|     // prefix is deliberately not declared with def so the output block can resolve it. | ||
|     prefix = task.ext.prefix ?: "${meta.id}" | ||
|     // The template writes "${prefix}.h5ad" into the task directory. If the staged input | ||
|     // already has that name, the write would go through the staged input link and | ||
|     // overwrite the upstream file, so refuse to run instead. | ||
|     if ("${h5ad}" == "${prefix}.h5ad") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" | ||
|     template('scdblfinder.R') | ||
|
|
||
|     stub: | ||
|     prefix = task.ext.prefix ?: "${meta.id}" | ||
|     """ | ||
|     touch ${prefix}.h5ad | ||
|     touch ${prefix}.csv | ||
|     touch versions.yml | ||
|     """ | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| name: "scdblfinder" | ||
| description: Detect doublets in single-cell RNA-seq data using scDblFinder | ||
| keywords: | ||
| - doublet-detection | ||
| - single-cell | ||
| - scrnaseq | ||
| - quality-control | ||
| tools: | ||
| - "scdblfinder": | ||
| description: "scDblFinder: Computational identification of doublets in single-cell transcriptomics data" | ||
| homepage: "https://bioconductor.org/packages/scDblFinder" | ||
| documentation: "https://bioconductor.org/packages/release/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html" | ||
| tool_dev_url: "https://github.com/plger/scDblFinder" | ||
| doi: "10.12688/f1000research.73600.2" | ||
| licence: ["GPL-3.0"] | ||
| identifier: biotools:scdblfinder | ||
|
|
||
| input: | ||
| - - meta: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing sample information | ||
| e.g. `[ id:'sample1' ]` | ||
| - h5ad: | ||
| type: file | ||
| description: AnnData object in h5ad format | ||
| pattern: "*.{h5ad}" | ||
| ontologies: | ||
| - edam: "http://edamontology.org/format_3590" # HDF5 format | ||
| - dbr: | ||
| type: number | ||
| description: | | ||
| Optional expected doublet rate (0-1). If null, scDblFinder estimates | ||
| the doublet rate internally. | ||
|
|
||
| output: | ||
| h5ad: | ||
| - - meta: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing sample information | ||
| e.g. `[ id:'sample1' ]` | ||
| - "*.h5ad": | ||
| type: file | ||
| description: AnnData object with doublet annotations | ||
| pattern: "*.h5ad" | ||
| ontologies: | ||
| - edam: "http://edamontology.org/format_3590" # HDF5 format | ||
| predictions: | ||
| - - meta: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing sample information | ||
| e.g. `[ id:'sample1' ]` | ||
| - "*.csv": | ||
| type: file | ||
| description: CSV file of per-cell doublet predictions; rows are cell barcodes and the single boolean column is named after the output prefix | ||
| pattern: "*.csv" | ||
| ontologies: | ||
| - edam: "http://edamontology.org/format_3752" # CSV | ||
| versions: | ||
| - versions.yml: | ||
| type: file | ||
| description: File containing software versions | ||
| pattern: "versions.yml" | ||
| ontologies: | ||
| - edam: http://edamontology.org/format_3750 # YAML | ||
|
|
||
| authors: | ||
| - "@KurayiChawatama" | ||
| maintainers: | ||
| - "@KurayiChawatama" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,102 @@ | ||
| #!/usr/bin/env Rscript | ||
|
|
||
| library(scDblFinder) | ||
| library(tidyverse) | ||
| library(SingleCellExperiment) | ||
| library(BiocParallel) | ||
| library(anndataR) | ||
|
|
||
| # Load the AnnData file staged by Nextflow and convert it to a | ||
| # SingleCellExperiment for scDblFinder. | ||
| adata <- read_h5ad("${h5ad}") | ||
| sce <- adata\$as_SingleCellExperiment() | ||
|
|
||
| # Remember the input cell names now; sce is overwritten with the | ||
| # scDblFinder result further down. | ||
| original_cell_names <- colnames(sce) | ||
|
|
||
| # Parallel backend: one worker per CPU granted to the task (never fewer | ||
| # than one), with a fixed RNG seed. | ||
| n_workers <- max(1L, as.integer("${task.cpus}")) | ||
| bp <- MulticoreParam(workers = n_workers, RNGseed = 123) | ||
|
|
||
| # Expected doublet rate as received from the Nextflow input. Anything that | ||
| # does not parse as a number becomes NA (coercion warning silenced on | ||
| # purpose); NA is resolved before scDblFinder is called. | ||
| dbr_input <- trimws("${dbr}") | ||
| dbr <- suppressWarnings(as.numeric(dbr_input)) | ||
|
|
||
| # Fix the global RNG state before scDblFinder runs on the counts matrix | ||
| # (first assay); scDblFinder builds artificial doublets internally and | ||
| # returns a new SCE. | ||
| set.seed(123) | ||
KurayiChawatama marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Resolve the expected doublet rate before calling scDblFinder: | ||
| #   * NA (nothing numeric was supplied): pass NULL so scDblFinder | ||
| #     estimates the rate itself. | ||
| #   * a number outside [0, 1]: stop, because dbr is a proportion of cells | ||
| #     and a value such as 8 (meant as a percentage) would otherwise be | ||
| #     used as-is. | ||
| #   * otherwise: use the supplied value. | ||
| if (is.na(dbr)) { | ||
|   message("No valid doublet_rate provided; using scDblFinder internal dbr estimation") | ||
|   dbr <- NULL | ||
| } else if (dbr < 0 || dbr > 1) { | ||
|   stop( | ||
|     "Provided doublet_rate (dbr) must be a proportion between 0 and 1, got: ", | ||
|     dbr, | ||
|     call. = FALSE | ||
|   ) | ||
| } else { | ||
|   message(paste0("Using provided doublet_rate (dbr): ", dbr)) | ||
| } | ||
|
|
||
| # Only the first assay matrix is handed to scDblFinder, so the returned | ||
| # object is a new SCE built from that matrix. | ||
| # NOTE(review): this presumably drops the other assays and the cell/gene | ||
| # metadata of the input from the written h5ad; confirm this is intended. | ||
| sce <- scDblFinder( | ||
|   assays(sce)[[1]], | ||
|   BPPARAM = bp, | ||
|   dbr = dbr | ||
| ) | ||
|
Comment on lines
+33
to
+37
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I found that scDblFinder has an optional |
||
|
|
||
| # Report how many cells were assigned to each scDblFinder class. | ||
| message("scDblFinder results summary:") | ||
| print(table(sce\$scDblFinder.class)) | ||
|
|
||
| # Rename scDblFinder.* columns for consistency with other doublet methods: | ||
| # swap the prefix first, then turn any remaining dots into underscores. | ||
| coldata_names <- colnames(colData(sce)) | ||
| idx <- grep("^scDblFinder\\\\.", coldata_names) | ||
| coldata_names[idx] <- gsub( | ||
|   "\\\\.", | ||
|   "_", | ||
|   sub("^scDblFinder\\\\.", "scdblfinder_", coldata_names[idx]) | ||
| ) | ||
| colnames(colData(sce)) <- coldata_names | ||
|
|
||
| # The doublet calls must stay keyed by the original cell barcodes. Fail if the | ||
| # result has no usable barcodes, or if they differ from the barcodes saved | ||
| # before scDblFinder ran (which also catches identifiers invented downstream | ||
| # when the input had none), instead of writing misaligned outputs. | ||
| result_barcodes <- colnames(sce) | ||
| if (is.null(result_barcodes) || length(result_barcodes) != ncol(sce)) { | ||
|   stop("scDblFinder output is missing valid cell barcodes; cannot write aligned h5ad and prediction outputs.") | ||
| } | ||
| if (!identical(as.character(result_barcodes), as.character(original_cell_names))) { | ||
|   stop("scDblFinder output cell barcodes do not match the input cell barcodes; cannot write aligned h5ad and prediction outputs.") | ||
| } | ||
|
|
||
| # Name of the primary assay, mapped explicitly to AnnData X when the h5ad is | ||
| # written so downstream readers see a valid matrix field. The length test | ||
| # comes first because assayNames() may return NULL, and a zero-length value | ||
| # would otherwise make the condition itself fail with an unrelated error. | ||
| primary_assay <- assayNames(sce)[1] | ||
| if (length(primary_assay) != 1L || is.na(primary_assay) || !nzchar(primary_assay)) { | ||
|   stop("scDblFinder output is missing a primary assay; cannot write h5ad output.") | ||
| } | ||
|
Comment on lines
+59
to
+64
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is also unnecessary because the sce is created from the anndata in a way that is perfectly prepared for reversing the process |
||
| # Persist the annotated SingleCellExperiment as h5ad, with the primary assay | ||
| # mapped to AnnData X. | ||
| write_h5ad(sce, "${prefix}.h5ad", x_mapping = primary_assay) | ||
|
|
||
| # Per-cell doublet calls for the doublet removal step: TRUE where the | ||
| # renamed class column says "doublet". | ||
| is_doublet <- colData(sce)\$scdblfinder_class == "doublet" | ||
|
|
||
| # One-column table named after the task prefix. Row names are attached in a | ||
| # second step rather than through the data.frame() row.names argument, where | ||
| # a single string would be read as a column selector. | ||
| predictions <- setNames(data.frame(is_doublet), "${prefix}") | ||
| rownames(predictions) <- colnames(sce) | ||
|
|
||
| # Save predictions to CSV (cell barcodes end up in the first column). | ||
| write.csv(predictions, "${prefix}.csv") | ||
|
|
||
| ################################################ | ||
| ################################################ | ||
| ## VERSIONS FILE ## | ||
| ################################################ | ||
| ################################################ | ||
|
|
||
| # Third whitespace-separated token of the R version string is the version number. | ||
| r_version <- unlist(strsplit(version[["version.string"]], " "))[3] | ||
| scdblfinder_version <- as.character(packageVersion("scDblFinder")) | ||
|
|
||
| # versions.yml: the process name as key, followed by one line per tool. | ||
| versions_yaml <- c( | ||
|   '"${task.process}":', | ||
|   paste(' R:', r_version), | ||
|   paste(' scDblFinder:', scdblfinder_version) | ||
| ) | ||
| writeLines(versions_yaml, 'versions.yml') | ||
|
|
||
| ################################################ | ||
| ################################################ | ||
| ################################################ | ||
| ################################################ | ||
Uh oh!
There was an error while loading. Please reload this page.