diff --git a/.gitignore b/.gitignore index 5516a066..7c60a3c2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ testing* null/ .lineage/ .nf-test* +tests/assets/*.h5ad +contrib/nf-core-test-datasets/build_output/ +contrib/nf-core-test-datasets/extension_base/ diff --git a/README.md b/README.md index e531a984..d45ee4ae 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Steps marked with the boat icon are not yet implemented. For the other steps, th 3. Integration - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html) - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html) - - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html) + - [Symphony](https://github.com/immunogenomics/symphony) / Harmony (via [symphonypy](https://pypi.org/project/symphonypy/)) - [BBKNN](https://github.com/Teichlab/bbknn) - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html) - [Seurat](https://satijalab.org/seurat/articles/integration_introduction) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index bf1be27f..976136b4 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -18,12 +18,12 @@ report_section_order: order: -1005 "scanvi": order: -1006 - "harmony": + "symphony": order: -1007 "bbknn": - order: -1008 - "combat": order: -1009 + "combat": + order: -1010 # If new tools are add. 
They need to be added here "nf-core-scdownstream-methods-description": order: -2001 diff --git a/assets/schema_analysis_plan.json b/assets/schema_analysis_plan.json index 92be8712..fdb44dbc 100644 --- a/assets/schema_analysis_plan.json +++ b/assets/schema_analysis_plan.json @@ -12,27 +12,31 @@ "pattern": "^\\S*$", "default": null, "errorMessage": "Integration name cannot contain spaces", - "description": "Integration method name, or empty to match all integrations" + "description": "Integration method name, or empty to match all integrations", + "meta": ["integration"] }, "subset": { "type": "string", "pattern": "^\\S*$", "default": null, "errorMessage": "Subset cannot contain spaces", - "description": "Clustering subset (global or a label value), or empty to match all subsets" + "description": "Clustering subset (global or a label value), or empty to match all subsets", + "meta": ["subset"] }, "resolution": { "type": "number", "minimum": 0, "default": null, - "description": "Leiden resolution, or empty to match all resolutions" + "description": "Leiden resolution, or empty to match all resolutions", + "meta": ["resolution"] }, "analyses": { "type": "string", "pattern": "^(|paga|liana|de|cytetype)(,(paga|liana|de|cytetype))*$", "default": null, "errorMessage": "Analyses must be a comma-separated list of paga, liana, de, and/or cytetype", - "description": "Downstream analyses to run for matching clusterings, or empty to run all analyses" + "description": "Downstream analyses to run for matching clusterings, or empty to run all analyses", + "meta": ["analyses"] } } } diff --git a/conf/modules.config b/conf/modules.config index 84748e8e..4260f400 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -379,9 +379,25 @@ process { ] } - withName: SCANPY_HARMONY { + withName: SYMPHONY_HARMONYINTEGRATE { publishDir = [ - path: { "${params.outdir}/combine/integrate/harmony" }, + path: { "${params.outdir}/combine/integrate/symphony" }, + mode: params.publish_dir_mode, 
+ saveAs: { filename -> + if (filename.endsWith('_reference.h5ad')) { + return 'symphony_reference.h5ad' + } + if (params.save_intermediates && !filename.equals('versions.yml')) { + return filename + } + return null + }, + ] + } + + withName: SYMPHONY_MAPEMBEDDING { + publishDir = [ + path: { "${params.outdir}/combine/integrate/symphony" }, mode: params.publish_dir_mode, enabled: params.save_intermediates, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, diff --git a/conf/test.config b/conf/test.config index 31b5a05b..fc2ae4f0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,7 +24,7 @@ params { // Input data input = params.pipelines_testdata_base_path + 'samplesheet.csv' - integration_methods = 'scvi,harmony,bbknn,combat' + integration_methods = 'scvi,symphony,bbknn,combat' doublet_detection = 'solo,scrublet,scdblfinder' celltypist_model = 'Adult_Human_Skin' celldex_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/scdownstream/singleR/references.csv' diff --git a/conf/test_full.config b/conf/test_full.config index f4629109..96e64421 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -24,7 +24,7 @@ params { // Input data for full size test input = params.pipelines_testdata_base_path + 'samplesheet.csv' - integration_methods = 'scvi,harmony,bbknn,combat' + integration_methods = 'scvi,symphony,bbknn,combat' doublet_detection = 'solo,scrublet,doubletdetection,scdblfinder' celltypist_model = 'Adult_Human_Skin' celldex_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/scdownstream/singleR/references.csv' diff --git a/docs/output.md b/docs/output.md index b35ab9d4..97403fec 100644 --- a/docs/output.md +++ b/docs/output.md @@ -34,7 +34,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d 3. 
Integration - [scVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html) - [scANVI](https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html) - - [Harmony](https://portals.broadinstitute.org/harmony/articles/quickstart.html) + - [Symphony](https://github.com/immunogenomics/symphony) / Harmony (via [symphonypy](https://pypi.org/project/symphonypy/)) - [BBKNN](https://github.com/Teichlab/bbknn) - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html) - [Seurat](https://satijalab.org/seurat/articles/integration_introduction) @@ -98,6 +98,7 @@ The `preprocess` directory contains a subdirectory for each sample, which contai - `${tool}` - `*.h5ad/*.rds`: The integrated H5AD or RDS file. - `X_${tool}.pkl`: Low-dimensional representation of the integrated data. + - `symphony_reference.h5ad` (Symphony only): Compact Symphony reference AnnData for query mapping, published from de novo Symphony runs. diff --git a/docs/reproducibility.md b/docs/reproducibility.md index 2e2e4312..f93ff464 100644 --- a/docs/reproducibility.md +++ b/docs/reproducibility.md @@ -120,23 +120,29 @@ The **Test strategy (this branch)** column describes what the tests on this bran ### `scanpy/` -| Module | Description | Reproducibility | Test strategy (this branch) | -| ------------------------ | -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | -| `scanpy/bbknn` | Constructs a batch-balanced k-nearest-neighbour graph (BBKNN) on a PCA embedding. | Fully deterministic — kNN construction is deterministic given the input embedding. 
| structural — versions + schema only | -| `scanpy/cellcycle` | Scores each cell for S-phase and G2M-phase activity and assigns a predicted cell cycle phase. | Fully deterministic | hash | -| `scanpy/combat` | Applies ComBat batch correction and then runs PCA, storing the result as `X_emb`. | Seeded / quasi-deterministic — ComBat is deterministic; downstream PCA floats may vary across LAPACK backends. | structural — versions + schema only | -| `scanpy/filter` | Filters cells and genes by count, gene, and mitochondrial percentage thresholds. | Fully deterministic | hash + structural — standard `hash` triple; multiple parameter scenarios | -| `scanpy/harmony` | Runs Harmony batch integration after log-normalisation and PCA, storing the corrected embedding as `X_emb`. | **Non-deterministic** — Harmony is an iterative optimisation with no fixed seed; upstream PCA is also unseeded. | structural — versions + schema only; `variance_ratio` output removed | -| `scanpy/hvgs` | Selects highly variable genes and subsets the AnnData to those genes. | Seeded / quasi-deterministic — HVG variance statistics rely on NumPy/SciPy floating-point operations that can produce slightly different results across library versions. | structural — versions + schema only | -| `scanpy/leiden` | Performs Leiden community-detection clustering at a specified resolution. | **Non-deterministic** — Leiden uses random restarts with no fixed seed. | structural — range assertion on cluster count + versions + schema | -| `scanpy/neighbors` | Computes a k-nearest-neighbour graph on a specified embedding. | Fully deterministic given a fixed input embedding. | structural — versions + schema only | -| `scanpy/paga` | Computes PAGA coarse-grained cluster connectivity and saves a graph and plot. | Fully deterministic — PAGA is a deterministic graph-summarisation step given fixed Leiden labels. | hash | -| `scanpy/pca` | Runs PCA with `random_state=0` and stores the result under a specified key. 
| Seeded / quasi-deterministic — seed is fixed, but float coordinates can differ across LAPACK/MKL backends. | structural — versions + schema only | -| `scanpy/plotqc` | Calculates QC metrics and produces a counts-vs-genes scatter plot for MultiQC. | Fully deterministic | hash (no H5AD output — PNG / MultiQC JSON + versions) | -| `scanpy/rankgenesgroups` | Runs differential gene expression (rank genes groups) across clusters using a configurable statistical method. | **Seeded / quasi-deterministic** — wilcoxon and t-test are deterministic in theory, but tied-rank handling and floating-point tie-breaking can differ across SciPy versions. | structural — versions + `adata.yaml`; one path with **empty h5ad** snapshots **versions only** | -| `scanpy/readh5` | Reads a 10x Genomics HDF5 (`.h5`) file and writes it as an AnnData H5AD. | Fully deterministic | hash | -| `scanpy/sample` | Down-samples cells to a fixed count or fraction using `rng=0`. | Seeded / quasi-deterministic — seed is fixed, but sampled cell set may vary across NumPy versions. | hash | -| `scanpy/umap` | Computes a UMAP embedding from a pre-built neighbour graph using `random_state=0`. | Seeded / quasi-deterministic — seed is fixed, but float coordinates vary across umap-learn/numba versions. | structural — versions + schema only | +| Module | Description | Reproducibility | Test strategy (this branch) | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------- | +| `scanpy/bbknn` | Constructs a batch-balanced k-nearest-neighbour graph (BBKNN) on a PCA embedding. 
| Fully deterministic — kNN construction is deterministic given the input embedding. | structural — versions + schema only | +| `scanpy/cellcycle` | Scores each cell for S-phase and G2M-phase activity and assigns a predicted cell cycle phase. | Fully deterministic | hash | +| `scanpy/combat` | Applies ComBat batch correction and then runs PCA, storing the result as `X_emb`. | Seeded / quasi-deterministic — ComBat is deterministic; downstream PCA floats may vary across LAPACK backends. | structural — versions + schema only | +| `scanpy/filter` | Filters cells and genes by count, gene, and mitochondrial percentage thresholds. | Fully deterministic | hash + structural — standard `hash` triple; multiple parameter scenarios | +| `scanpy/hvgs` | Normalizes counts (`normalize_total` → `log1p`), selects highly variable genes, and subsets the AnnData to those genes while keeping raw counts in `X`. | Seeded / quasi-deterministic — HVG variance statistics rely on NumPy/SciPy floating-point operations that can produce slightly different results across library versions. | structural — versions + schema only | +| `scanpy/leiden` | Performs Leiden community-detection clustering at a specified resolution. | **Non-deterministic** — Leiden uses random restarts with no fixed seed. | structural — range assertion on cluster count + versions + schema | +| `scanpy/neighbors` | Computes a k-nearest-neighbour graph on a specified embedding. | Fully deterministic given a fixed input embedding. | structural — versions + schema only | +| `scanpy/paga` | Computes PAGA coarse-grained cluster connectivity and saves a graph and plot. | Fully deterministic — PAGA is a deterministic graph-summarisation step given fixed Leiden labels. | hash | +| `scanpy/pca` | Runs library-size normalization, log1p, and PCA with `random_state=0`, storing the result under a specified key. | Seeded / quasi-deterministic — seed is fixed, but float coordinates can differ across LAPACK/MKL backends. 
Embeddings differ from earlier pipeline versions that ran PCA on unnormalized counts. | structural — versions + schema only | +| `scanpy/plotqc` | Calculates QC metrics and produces a counts-vs-genes scatter plot for MultiQC. | Fully deterministic | hash (no H5AD output — PNG / MultiQC JSON + versions) | +| `scanpy/rankgenesgroups` | Runs differential gene expression (rank genes groups) across clusters using a configurable statistical method. | **Seeded / quasi-deterministic** — wilcoxon and t-test are deterministic in theory, but tied-rank handling and floating-point tie-breaking can differ across SciPy versions. | structural — versions + `adata.yaml`; one path with **empty h5ad** snapshots **versions only** | +| `scanpy/readh5` | Reads a 10x Genomics HDF5 (`.h5`) file and writes it as an AnnData H5AD. | Fully deterministic | hash | +| `scanpy/sample` | Down-samples cells to a fixed count or fraction using `rng=0`. | Seeded / quasi-deterministic — seed is fixed, but sampled cell set may vary across NumPy versions. | hash | +| `scanpy/umap` | Computes a UMAP embedding from a pre-built neighbour graph using `random_state=0`. | Seeded / quasi-deterministic — seed is fixed, but float coordinates vary across umap-learn/numba versions. 
| structural — versions + schema only | + +### `symphony/` + +| Module | Description | Reproducibility | Test strategy (this branch) | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | +| `symphony/harmonyintegrate` | Runs Symphony integration via symphonypy/Harmony after normalize_total → log1p → scale(max_value=10) → PCA(zero_center=False), storing `X_symphony`, `X_emb`, Symphony reference metadata (`var` mean/std/HVG, `varm['PCs']`, `uns['harmony']`, `uns['normalize']`), and publishing a compact `symphony_reference.h5ad`. Requires symphonypy ≥0.2.3 ([symphonypy#8](https://github.com/potulabe/symphonypy/issues/8), [symphonypy#9](https://github.com/potulabe/symphonypy/issues/9)). | **Non-deterministic** — Harmony is an iterative optimisation; symphonypy passes `random_seed=1` but upstream PCA is unseeded. | structural — versions + schema only | +| `symphony/mapembedding` | Maps query cells onto a Symphony reference via symphonypy `map_embedding`, storing mapped coordinates in `X_symphony` and `X_emb`. | **Non-deterministic** — inherits Symphony/Harmony mapping variability. | structural — versions + schema only | ### `scimilarity/` @@ -180,11 +186,11 @@ The **Test strategy (this branch)** column describes what the tests on this bran | `ambient_correction` | Dispatches ambient RNA correction to decontX, SoupX, or none based on a parameter. 
| **Non-deterministic** for decontX (no seed) and SoupX (seeded clustering but variable results); fully deterministic for the `none` passthrough. | **Scenario-dependent:** often `versions` as YAML + `adata.yaml` when an H5AD is produced; **`none` / meta-disabled** paths may snapshot **only `versions` + `workflow.out.h5ad.size()`** (counts, not hashes). | | `celltype_assignment` | Orchestrates cell type annotation by running SingleR and/or CellTypist. | Fully deterministic at inference time for both methods. | **`workflow.out.versions` + `workflow.out.obs.size()`** for non-stub tests; separate **stub** test exercises subworkflow wiring. | | `cluster` | Full clustering pipeline: neighbours → UMAP → Leiden at multiple resolutions → Shannon entropy. | Seeded / quasi-deterministic for UMAP; **non-deterministic** due to unseeded Leiden. | structural — **`workflow.out.versions` only** (each as YAML); graph / embedding presence asserted in code outside `snapshot`. | -| `combine` | Merges all samples and runs all configured integration methods. | Inherits from constituent modules — ranges from fully deterministic (no integration) to seeded/quasi-deterministic (scVI, Harmony, Seurat). | structural — **`workflow.out.versions` (YAML) + `adata.yaml`** on merged H5AD. | +| `combine` | Merges all samples and runs all configured integration methods. | Inherits from constituent modules — ranges from fully deterministic (no integration) through seeded/quasi-deterministic (scVI, Seurat) to **non-deterministic** (Symphony). | structural — **`workflow.out.versions` (YAML) + `adata.yaml`** on merged H5AD. | | `differential_expression` | Runs rank-genes-groups DE analysis across all combinations of clustering labels, conditions, and cell-type subsets. | Fully deterministic for the default wilcoxon/t-test methods. | structural — **`workflow.out.versions` only** (YAML); DE / MultiQC presence asserted outside `snapshot` where needed.
| | `doublet_detection` | Runs one or more doublet-detection methods (scdblfinder, solo, scrublet, doubletdetection) and removes called doublets. | **Non-deterministic** — solo, scrublet, and doubletdetection have stochastic components; scdblfinder is seeded. | structural + **range assertion** on **`n_obs`**; snapshot uses **`versions` (YAML) + `adata.yaml`**. | | `finalize` | Assembles the final AnnData by extending it with all collected obs/obsm/uns/layers outputs. | Fully deterministic | hash — **`workflow.out.h5ad` + `workflow.out.versions` (YAML) + `adata.yaml`** — not a bare `snapshot(workflow.out)` in non-stub tests. | -| `integrate` | Applies HVG selection then one or more integration methods (scVI, scANVI, Harmony, BBKNN, ComBat, Seurat, SCimilarity, PCA, EXPIMAP). | Seeded / quasi-deterministic for scVI/scANVI/ComBat/Seurat/BBKNN/PCA; **non-deterministic** for Harmony and EXPIMAP (iterative training). | structural — **`workflow.out.versions` (YAML) + `adata.yaml`** on integration H5AD (e.g. Harmony / BBKNN / ComBat / PCA tests). | +| `integrate` | Applies HVG selection then one or more integration methods (scVI, scANVI, Symphony, BBKNN, ComBat, Seurat, SCimilarity, PCA, EXPIMAP). | Seeded / quasi-deterministic for scVI/scANVI/ComBat/Seurat/BBKNN/PCA; **non-deterministic** for Symphony and EXPIMAP (iterative training). | structural — **`workflow.out.versions` (YAML) + `adata.yaml`** on integration H5AD (e.g. Symphony / BBKNN / ComBat / PCA tests). | | `load_h5ad` | Loads input files in H5AD, 10x H5, RDS, or CSV format and converts all to AnnData H5AD. | Fully deterministic | hash — **`snapshot(workflow.out)` only** (passthrough-safe; avoids `anndata().yaml` on unstaged inputs per nf-test rules). | | `per_group` | Runs PAGA, LIANA rank-aggregate, rank-genes DE, and optional CyteType per cluster grouping. | **Seeded / quasi-deterministic** — inherits from constituent modules; CyteType is non-deterministic when enabled. 
| structural — **`workflow.out.versions` only** (YAML); optional `workflow.out.obs.size()` when CyteType is enabled. | | `pseudobulking` | Aggregates single-cell data into pseudobulk profiles grouped by specified metadata columns. | Fully deterministic | hash — **`workflow.out` + `versions` (YAML) + `adata.yaml`** on pseudobulk H5AD. | diff --git a/docs/usage.md b/docs/usage.md index 50ad2c88..8447d758 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -51,29 +51,29 @@ sample3,/absolute/path/to/sample3_filtered.csv,/absolute/path/to/sample3.csv,,,, For CSV input files, specifying the `batch_col`, `label_col`, `condition_col`, and `unknown_label` columns will not have any effect, as no additional metadata is available in the CSV file. -| Column | Description | -| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Unique sample identifier. Will be added to the pipeline output objects as `sample` column. | -| `filtered` | May contain paths to `h5ad`, `h5`, `rds`, or `csv` files. `rds` files may contain any object that can be converted to a `SingleCellExperiment` using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function. `csv` files should contain a matrix with genes as columns and cells as rows. | -| `unfiltered` | Same as `filtered`, but for the unfiltered cellranger or nf-core/scrnaseq output. If not provided, only `decontX` can be used for ambient RNA removal. | -| `batch_col` | Column in the input file containing batch information. If not provided, the entire input object will be considered as one batch. 
If the `batch_col` is something else than `batch`, it will be renamed to `batch` during pipeline execution. | -| `symbol_col` | Column in the input file containing gene symbol information. Defaults to `index`. There are two special values that can be used: `index` and `none`. `index` will use the row names of the matrix as gene symbols. `none` will trigger the pipeline to perform gene symbol conversion using MyGene.info based on the `geneid_col`. The values from `symbol_col` will be set as the index during pipeline execution. | -| `geneid_col` | Column in the input file containing gene identifier information. Defaults to `index`. Only used if `symbol_col` is set to `none`. | -| `label_col` | Column in the input file containing cell type information. Defaults to `label`. If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. If the `label_col` is something else than `label`, it will be renamed to `label` during pipeline execution. | -| `condition_col` | Column in the input file containing condition information (e.g. disease state, treatment). If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. If the `condition_col` is something else than `condition`, it will be renamed to `condition` during pipeline execution. | -| `unknown_label` | Value in the `label_col` column that should be considered as unknown. Defaults to `unknown`. If the `unknown_label` is something else than `unknown`, it will be renamed to `unknown` during pipeline execution. If trying to perform integration with scANVI, more than one unique label other than `unknown` must exist in the input data. | -| `counts_layer` | Layer in the input file containing the raw counts matrix. Defaults to `X`. | -| `min_genes` | Minimum number of genes required for a cell to be considered. Defaults to `1`. | -| `min_cells` | Minimum number of cells required for a gene to be considered. Defaults to `1`. 
| -| `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. | -| `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | -| `expected_cells` | Number of expected cells, used as input to CellBender for empty droplet detection. | -| `doublet_rate` | Optional expected doublet rate (0-1) for `scDblFinder`. If not provided, `scDblFinder` estimates it internally. | -| `max_mito_percentage` | Maximum percentage of mitochondrial reads for a cell to be considered. Defaults to `100`. | -| `min_ribo_percentage` | Minimum percentage of ribosomal reads for a cell to be considered. Defaults to `0`. | -| `max_hb_percentage` | Maximum percentage of haemoglobin reads for a cell to be considered. Defaults to `100`. | -| `ambient_correction` | Whether to perform ambient RNA correction for this sample. Set to `true` to use the globally configured method, `false` to skip ambient correction for this sample. Defaults to `true`. | -| `ambient_corrected_integration` | Whether to use ambient-corrected counts for integration for this sample. Set to `true` to use corrected counts in downstream integration, `false` to store them only as additional layers. Can override the global `--ambient_corrected_integration` parameter. Defaults to global setting. | +| Column | Description | +| ------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample identifier. Will be added to the pipeline output objects as `sample` column. 
| +| `filtered` | May contain paths to `h5ad`, `h5`, `rds`, or `csv` files. `rds` files may contain any object that can be converted to a `SingleCellExperiment` using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function. `csv` files should contain a matrix with genes as columns and cells as rows. | +| `unfiltered` | Same as `filtered`, but for the unfiltered cellranger or nf-core/scrnaseq output. If not provided, only `decontX` can be used for ambient RNA removal. | +| `batch_col` | Column in the input file containing batch information. If not provided, the entire input object will be considered as one batch. If the `batch_col` is something else than `batch`, it will be renamed to `batch` during pipeline execution. | +| `symbol_col` | Column in the input file containing gene symbol information. Defaults to `index`. There are two special values that can be used: `index` and `none`. `index` will use the row names of the matrix as gene symbols. `none` will trigger the pipeline to perform gene symbol conversion using MyGene.info based on the `geneid_col` and the pipeline `--species` parameter. The values from `symbol_col` will be set as the index during pipeline execution. | +| `geneid_col` | Column in the input file containing gene identifier information. Defaults to `index`. Only used if `symbol_col` is set to `none`. | +| `label_col` | Column in the input file containing cell type information. Defaults to `label`. If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. If the `label_col` is something else than `label`, it will be renamed to `label` during pipeline execution. | +| `condition_col` | Column in the input file containing condition information (e.g. disease state, treatment). If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. 
If the `condition_col` is something else than `condition`, it will be renamed to `condition` during pipeline execution. | +| `unknown_label` | Value in the `label_col` column that should be considered as unknown. Defaults to `unknown`. If the `unknown_label` is something else than `unknown`, it will be renamed to `unknown` during pipeline execution. If trying to perform integration with scANVI, more than one unique label other than `unknown` must exist in the input data. | +| `counts_layer` | Layer in the input file containing the raw counts matrix. Defaults to `X`. | +| `min_genes` | Minimum number of genes required for a cell to be considered. Defaults to `1`. | +| `min_cells` | Minimum number of cells required for a gene to be considered. Defaults to `1`. | +| `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. | +| `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | +| `expected_cells` | Number of expected cells, used as input to CellBender for empty droplet detection. | +| `doublet_rate` | Optional expected doublet rate (0-1) for `scDblFinder`. If not provided, `scDblFinder` estimates it internally. | +| `max_mito_percentage` | Maximum percentage of mitochondrial reads for a cell to be considered. Defaults to `100`. | +| `min_ribo_percentage` | Minimum percentage of ribosomal reads for a cell to be considered. Defaults to `0`. | +| `max_hb_percentage` | Maximum percentage of haemoglobin reads for a cell to be considered. Defaults to `100`. | +| `ambient_correction` | Whether to perform ambient RNA correction for this sample. Set to `true` to use the globally configured method, `false` to skip ambient correction for this sample. Defaults to `true`. | +| `ambient_corrected_integration` | Whether to use ambient-corrected counts for integration for this sample. 
Set to `true` to use corrected counts in downstream integration, `false` to store them only as additional layers. Can override the global `--ambient_corrected_integration` parameter. Defaults to global setting. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -185,6 +185,7 @@ nextflow run nf-core/scdownstream --input samplesheet.csv --outdir results --cel #### Species Bundled gene lists are provided for human and mouse. +`--species` also selects the MyGene.info taxonomy used when samples have `symbol_col: none` and gene identifiers are converted via MyGene.info. Select the appropriate species with `--species`: ```bash @@ -216,14 +217,16 @@ nextflow run nf-core/scdownstream --input samplesheet.csv --outdir results \ ### Reference mapping and extension **Reference mapping** means **mapping new cells into a latent space using a pre-trained model** instead of training that integration step only on the query data. -In this pipeline this can be done using **scVI**, **scANVI**, and **scimilarity**. -To enable it, add the corresponding method to [`integration_methods`](https://nf-co.re/scdownstream/parameters#integration_methods) (`scvi`, `scanvi`, and/or `scimilarity`) and set the matching model parameters for each method you use: [`scvi_model`](https://nf-co.re/scdownstream/parameters#scvi_model), [`scanvi_model`](https://nf-co.re/scdownstream/parameters#scanvi_model), and [`scimilarity_model`](https://nf-co.re/scdownstream/parameters#scimilarity_model) (see the [parameter reference](https://nf-co.re/scdownstream/parameters) for file types, defaults, and help text). +In this pipeline this can be done using **scVI**, **scANVI**, **scimilarity**, and **Symphony**. 
+To enable it, add the corresponding method to [`integration_methods`](https://nf-co.re/scdownstream/parameters#integration_methods) (`scvi`, `scanvi`, `scimilarity`, and/or `symphony`) and set the matching model or reference parameter for each method you use: [`scvi_model`](https://nf-co.re/scdownstream/parameters#scvi_model), [`scanvi_model`](https://nf-co.re/scdownstream/parameters#scanvi_model), [`scimilarity_model`](https://nf-co.re/scdownstream/parameters#scimilarity_model), and [`symphony_reference`](https://nf-co.re/scdownstream/parameters#symphony_reference) (see the [parameter reference](https://nf-co.re/scdownstream/parameters) for file types, defaults, and help text). + +For Symphony reference mapping, provide the compact Symphony reference AnnData from a prior de novo run (`{outdir}/combine/integrate/symphony/symphony_reference.h5ad`). It contains the gene statistics, PCA loadings, Harmony centroids, and normalization metadata required for query mapping. **Extension** is for users that have outputs of a previous run of `nf-core/scdownstream` and want to extend it with new data, without re-running the integration from scratch. -It only works if `scvi`, `scanvi` and/or `scimilarity` have been enabled in `integration_methods` in the original pipeline run. -Other integration methods than the three mentioned before are not supported for this. +It only works if `scvi`, `scanvi`, `scimilarity`, and/or `symphony` have been enabled in `integration_methods` in the original pipeline run. +Integration methods other than these four are not supported for extension. In simple terms, in this setup the workflow is: (1) project new data into the latent space learned from the data in the original run, and then (2) combine the datasets.
-For (1), provide the same checkpoints as for reference mapping ([`scvi_model`](https://nf-co.re/scdownstream/parameters#scvi_model), [`scanvi_model`](https://nf-co.re/scdownstream/parameters#scanvi_model), [`scimilarity_model`](https://nf-co.re/scdownstream/parameters#scimilarity_model)). +For (1), provide the same checkpoints as for reference mapping ([`scvi_model`](https://nf-co.re/scdownstream/parameters#scvi_model), [`scanvi_model`](https://nf-co.re/scdownstream/parameters#scanvi_model), [`scimilarity_model`](https://nf-co.re/scdownstream/parameters#scimilarity_model), [`symphony_reference`](https://nf-co.re/scdownstream/parameters#symphony_reference)). For (2), pass the integrated `.h5ad` from the original run as [`base_adata`](https://nf-co.re/scdownstream/parameters#base_adata). Pre-trained scVI models are also shared on [scvi-hub](https://huggingface.co/scvi-tools). @@ -281,11 +284,11 @@ Each row in the CSV selects a subset of clusterings. **All columns are optional* When multiple rows match a clustering result, their `analyses` lists are **combined** (duplicates removed). If any matching row leaves `analyses` empty, all analyses run for that clustering. Clusterings that match **no** row are excluded from Leiden and all downstream analyses — but their UMAP and neighbour graph are still computed. 
-Example plan: full analysis on Harmony at resolution 0.5, DE-only at resolution 1.0 for every integration, and DE-only for scVI at any resolution: +Example plan: full analysis on Symphony at resolution 0.5, DE-only at resolution 1.0 for every integration, and DE-only for scVI at any resolution: ```csv title="analysis_plan.csv" integration,subset,resolution,analyses -harmony,global,0.5,"paga,de,cytetype" +symphony,global,0.5,"paga,de,cytetype" ,,1.0,de scvi,,,de ``` diff --git a/main.nf b/main.nf index b99f8488..63a26ad8 100644 --- a/main.nf +++ b/main.nf @@ -46,6 +46,7 @@ workflow NFCORE_SCDOWNSTREAM { cell_cycle_scoring // value: boolean s_genes // path: file or [] g2m_genes // path: file or [] + species // value: string qc_only // value: boolean celldex_reference // value: string celltypist_model // value: string @@ -61,6 +62,7 @@ workflow NFCORE_SCDOWNSTREAM { scvi_categorical_covariates // value: string scvi_continuous_covariates // value: string scimilarity_model // value: string + symphony_reference // value: string expimap_gmt // value: string skip_liana // value: boolean skip_rankgenesgroups // value: boolean @@ -102,6 +104,7 @@ workflow NFCORE_SCDOWNSTREAM { cell_cycle_scoring, s_genes, g2m_genes, + species, qc_only, celldex_reference, celltypist_model, @@ -117,6 +120,7 @@ workflow NFCORE_SCDOWNSTREAM { scvi_categorical_covariates, scvi_continuous_covariates, scimilarity_model, + symphony_reference, expimap_gmt, skip_liana, skip_rankgenesgroups, @@ -180,6 +184,10 @@ workflow { def analysis_plan = analysisPlanToList() + def symphony_reference = params.symphony_reference + ? 
file(params.symphony_reference, checkIfExists: true) + : null + NFCORE_SCDOWNSTREAM ( PIPELINE_INITIALISATION.out.samplesheet, ch_base_adata, @@ -196,6 +204,7 @@ workflow { params.cell_cycle_scoring, s_genes_file, g2m_genes_file, + params.species, params.qc_only, params.celldex_reference, params.celltypist_model, @@ -211,6 +220,7 @@ workflow { params.scvi_categorical_covariates, params.scvi_continuous_covariates, params.scimilarity_model, + symphony_reference, params.expimap_gmt, params.skip_liana, params.skip_rankgenesgroups, diff --git a/modules/local/adata/mygene/main.nf b/modules/local/adata/mygene/main.nf index d80629a4..6ab5b022 100644 --- a/modules/local/adata/mygene/main.nf +++ b/modules/local/adata/mygene/main.nf @@ -9,6 +9,7 @@ process ADATA_MYGENE { input: tuple val(meta), path(h5ad) + val(species) output: tuple val(meta), path("*.h5ad"), emit: h5ad diff --git a/modules/local/adata/mygene/templates/mygene.py b/modules/local/adata/mygene/templates/mygene.py index e9050d91..56173426 100644 --- a/modules/local/adata/mygene/templates/mygene.py +++ b/modules/local/adata/mygene/templates/mygene.py @@ -21,10 +21,17 @@ ) mg = mygene.MyGeneInfo() -df_genes = mg.querymany(inputs, - scopes=["symbol", "entrezgene", "ensemblgene"], - fields="symbol", species="human", as_dataframe=True) -mapping = df_genes["symbol"].dropna().to_dict() +mapping = {} +for i in range(0, len(inputs), 500): + df_genes = mg.querymany( + inputs[i : i + 500], + scopes=["symbol", "entrezgene", "ensemblgene"], + fields="symbol", + species="${species}", + as_dataframe=True, + ) + # A batch in which no identifier resolves may come back without a "symbol" column; reindex guarantees the column so such a batch is skipped instead of raising KeyError. + mapping.update(df_genes.reindex(columns=["symbol"])["symbol"].dropna().to_dict()) outputs = [mapping.get(i, i) for i in inputs] diff --git a/modules/local/adata/mygene/tests/main.nf.test b/modules/local/adata/mygene/tests/main.nf.test index 75cf7191..97ec0dc4 100644 --- a/modules/local/adata/mygene/tests/main.nf.test +++ b/modules/local/adata/mygene/tests/main.nf.test @@ -20,6 +20,7 @@ nextflow_process { file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) ] ) + input[1] = 'human' """ } } @@ -51,6 +52,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/SRR28679759_filtered_matrix.h5ad', checkIfExists: true) ] ) + input[1] = 'human' """ } } diff --git a/modules/local/adata/prepcellxgene/templates/prepcellxgene.py b/modules/local/adata/prepcellxgene/templates/prepcellxgene.py index a36ffce3..7f00e6c9 100644 --- a/modules/local/adata/prepcellxgene/templates/prepcellxgene.py +++ b/modules/local/adata/prepcellxgene/templates/prepcellxgene.py @@ -17,7 +17,7 @@ adata = ad.read_h5ad("${h5ad}") -integration_methods = ["harmony", "scvi", "scanvi", "scimilarity", "seurat", "bbknn", "combat", "pca", "expimap"] +integration_methods = ["symphony", "scvi", "scanvi", "scimilarity", "seurat", "bbknn", "combat", "pca", "expimap"] for integration in integration_methods: embedding_key = f"X_{integration}" diff --git a/modules/local/adata/prepcellxgene/tests/main.nf.test.snap b/modules/local/adata/prepcellxgene/tests/main.nf.test.snap index 58b78d2a..8fa47513 100644 --- a/modules/local/adata/prepcellxgene/tests/main.nf.test.snap +++ b/modules/local/adata/prepcellxgene/tests/main.nf.test.snap @@ -40,7 +40,7 @@ { "id": "test" }, - "test.h5ad:md5,e962f73664186924dfe5269caed069bb" + "test.h5ad:md5,1ea9af3fd7a7908e99d6a0ec04f62b89" ] ], "1": [ @@ -51,7 +51,7 @@ { "id": "test" }, - "test.h5ad:md5,e962f73664186924dfe5269caed069bb" + "test.h5ad:md5,1ea9af3fd7a7908e99d6a0ec04f62b89" ] ], "versions": [ @@ -115,10 +115,10 @@ "obsm": [ "X_bbknn-global_umap", "X_combat-global_umap", + "X_harmony", "X_harmony-global_umap", "X_scvi-global_umap", "combat", - "harmony", "scvi" ], "varm": [ @@ -153,10 +153,10 @@ ] } ], - "timestamp": "2026-03-29T12:57:46.020211425", + "timestamp": "2026-05-31T10:12:36.828571877", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } } } \ No newline at end 
of file diff --git a/modules/local/adata/splitembeddings/tests/main.nf.test.snap b/modules/local/adata/splitembeddings/tests/main.nf.test.snap index 911f0c48..7d87289d 100644 --- a/modules/local/adata/splitembeddings/tests/main.nf.test.snap +++ b/modules/local/adata/splitembeddings/tests/main.nf.test.snap @@ -144,10 +144,10 @@ ] } ], - "timestamp": "2026-03-29T14:55:42.179483745", + "timestamp": "2026-05-31T10:09:07.090064015", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } } } \ No newline at end of file diff --git a/modules/local/scanpy/combat/tests/main.nf.test.snap b/modules/local/scanpy/combat/tests/main.nf.test.snap index c3becdf6..67c00aa4 100644 --- a/modules/local/scanpy/combat/tests/main.nf.test.snap +++ b/modules/local/scanpy/combat/tests/main.nf.test.snap @@ -56,7 +56,7 @@ }, { "n_obs": 38234, - "n_vars": 100, + "n_vars": 101, "obs": { "index": "_index", "columns": [ @@ -95,7 +95,7 @@ ] } ], - "timestamp": "2026-05-11T12:33:40.280258286", + "timestamp": "2026-05-31T07:26:05.351659325", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/modules/local/scanpy/harmony/environment.yml b/modules/local/scanpy/harmony/environment.yml deleted file mode 100644 index c048fcad..00000000 --- a/modules/local/scanpy/harmony/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::python=3.13.12 - - bioconda::harmonypy=0.2.0 - - conda-forge::pyyaml=6.0.3 - - conda-forge::scanpy=1.12 diff --git a/modules/local/scanpy/harmony/templates/harmony.py b/modules/local/scanpy/harmony/templates/harmony.py deleted file mode 100644 index 61b307ef..00000000 --- a/modules/local/scanpy/harmony/templates/harmony.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 - -# Disable OpenMP CPU topology detection for MacOS compatibility -import os -os.environ["KMP_AFFINITY"] = "disabled" - -import platform -import yaml - -os.environ["MPLCONFIGDIR"] = "./tmp/mpl" 
-os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba" - -import harmonypy -import scanpy as sc -import pandas as pd - -from threadpoolctl import threadpool_limits -threadpool_limits(int("${task.cpus}")) - -adata = sc.read_h5ad("${h5ad}") - -prefix = "${prefix}" - -adata_processing = adata.copy() - -if "${counts_layer}" != "X": - adata_processing.X = adata.layers["${counts_layer}"] - -sc.pp.log1p(adata_processing) -sc.pp.pca(adata_processing) - -harmony_out = harmonypy.run_harmony( - adata_processing.obsm["X_pca"].astype("float64"), - adata_processing.obs, - "${batch_col}", -) - -emb = harmony_out.Z_corr - -# harmonypy 0.2.0 changed Z_corr orientation; accept either layout. -# See https://github.com/potulabe/symphonypy/issues/8 -if emb.shape == adata_processing.obsm["X_pca"].shape: - adata_processing.obsm["X_emb"] = emb -elif emb.T.shape == adata_processing.obsm["X_pca"].shape: - adata_processing.obsm["X_emb"] = emb.T -else: - raise ValueError( - f"Unexpected Harmony embedding shape {emb.shape}; " - f"expected {adata_processing.obsm['X_pca'].shape} or its transpose." 
- ) - -adata.obsm["X_emb"] = adata_processing.obsm["X_emb"] - -adata.write_h5ad(f"{prefix}.h5ad") - -df = pd.DataFrame(adata.obsm["X_emb"], index=adata.obs_names) -df.to_pickle(f"X_{prefix}.pkl") - -# Versions - -versions = { - "${task.process}": { - "python": platform.python_version(), - "scanpy": sc.__version__, - "harmonypy": harmonypy.__version__, - "pandas": pd.__version__ - } -} - -with open("versions.yml", "w") as f: - yaml.dump(versions, f) diff --git a/modules/local/scanpy/hvgs/templates/hvgs.py b/modules/local/scanpy/hvgs/templates/hvgs.py index 70e43873..9e8bf0a1 100644 --- a/modules/local/scanpy/hvgs/templates/hvgs.py +++ b/modules/local/scanpy/hvgs/templates/hvgs.py @@ -41,6 +41,7 @@ raw_counts = adata.X.copy() + sc.pp.normalize_total(adata) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, **kwargs) diff --git a/modules/local/scanpy/hvgs/tests/main.nf.test.snap b/modules/local/scanpy/hvgs/tests/main.nf.test.snap index 0fb740d0..e6dfc237 100644 --- a/modules/local/scanpy/hvgs/tests/main.nf.test.snap +++ b/modules/local/scanpy/hvgs/tests/main.nf.test.snap @@ -9,7 +9,7 @@ }, { "n_obs": 38234, - "n_vars": 100, + "n_vars": 101, "obs": { "index": "_index", "columns": [ @@ -26,19 +26,19 @@ ] }, "layers": [ - + ], "obsm": [ - + ], "varm": [ - + ], "obsp": [ - + ], "varp": [ - + ], "uns": [ "hvg", @@ -46,10 +46,10 @@ ] } ], - "timestamp": "2026-03-29T11:18:05.314404083", + "timestamp": "2026-05-28T12:02:56.794195774", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } }, "Should run without a specified number of HVGs": { @@ -62,7 +62,7 @@ }, { "n_obs": 38234, - "n_vars": 251, + "n_vars": 111, "obs": { "index": "_index", "columns": [ @@ -79,19 +79,19 @@ ] }, "layers": [ - + ], "obsm": [ - + ], "varm": [ - + ], "obsp": [ - + ], "varp": [ - + ], "uns": [ "hvg", @@ -99,10 +99,10 @@ ] } ], - "timestamp": "2026-03-29T11:17:05.806436168", + "timestamp": "2026-05-28T12:02:32.511106972", "meta": { "nf-test": "0.9.4", - "nextflow": 
"25.10.2" + "nextflow": "26.04.0" } }, "Should run without failures - stub": { @@ -154,7 +154,7 @@ }, { "n_obs": 38234, - "n_vars": 100, + "n_vars": 101, "obs": { "index": "_index", "columns": [ @@ -171,19 +171,19 @@ ] }, "layers": [ - + ], "obsm": [ - + ], "varm": [ - + ], "obsp": [ - + ], "varp": [ - + ], "uns": [ "hvg", @@ -191,10 +191,10 @@ ] } ], - "timestamp": "2026-03-29T11:19:17.695541068", + "timestamp": "2026-05-31T10:13:07.459491579", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } } -} +} \ No newline at end of file diff --git a/modules/local/scanpy/pca/templates/pca.py b/modules/local/scanpy/pca/templates/pca.py index 666ab4cb..b3554d0a 100644 --- a/modules/local/scanpy/pca/templates/pca.py +++ b/modules/local/scanpy/pca/templates/pca.py @@ -21,7 +21,8 @@ prefix = "${prefix}" key_added = "${key_added}" -# Run PCA +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) sc.pp.pca(adata, random_state=0, key_added=key_added) adata.write_h5ad(f"{prefix}.h5ad") diff --git a/modules/local/scanpy/pca/tests/main.nf.test.snap b/modules/local/scanpy/pca/tests/main.nf.test.snap index dd729e71..1ae72069 100644 --- a/modules/local/scanpy/pca/tests/main.nf.test.snap +++ b/modules/local/scanpy/pca/tests/main.nf.test.snap @@ -59,11 +59,11 @@ "var": { "index": "_index", "columns": [ - + ] }, "layers": [ - + ], "obsm": [ "X_pca" @@ -72,20 +72,21 @@ "X_pca" ], "obsp": [ - + ], "varp": [ - + ], "uns": [ - "X_pca" + "X_pca", + "log1p" ] } ], - "timestamp": "2026-03-29T11:17:21.253081099", + "timestamp": "2026-05-28T12:10:47.461951809", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } } -} +} \ No newline at end of file diff --git a/modules/local/seurat/integration/environment.yml b/modules/local/seurat/integration/environment.yml index f6bd79fc..d362a6f1 100644 --- a/modules/local/seurat/integration/environment.yml +++ b/modules/local/seurat/integration/environment.yml @@ -3,5 +3,6 @@ channels: - bioconda 
dependencies: - bioconda::bioconductor-anndatar=1.0.2 + - bioconda::bioconductor-glmgampoi=1.22.0 - bioconda::bioconductor-rhdf5=2.54.1 - - conda-forge::r-seurat=5.4.0 + - conda-forge::r-seurat=5.5.0 diff --git a/modules/local/seurat/integration/main.nf b/modules/local/seurat/integration/main.nf index 3cc2605c..33ff14fc 100644 --- a/modules/local/seurat/integration/main.nf +++ b/modules/local/seurat/integration/main.nf @@ -4,8 +4,8 @@ process SEURAT_INTEGRATION { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4393c608e642b1232cd7bb84e6c5d7620c4b167462f342a4780307e5e67596b/data': - 'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-rhdf5_r-seurat:71809468c7d8a963' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7b/7bbad8d18ada67c2ca1dfaec11c5acb0fcd355713fec10331b0e202f1d6165f1/data': + 'community.wave.seqera.io/library/bioconductor-anndatar_bioconductor-glmgampoi_bioconductor-rhdf5_r-seurat:a0acfd4813d44adc' }" input: tuple val(meta), path(h5ad) diff --git a/modules/local/seurat/integration/tests/main.nf.test b/modules/local/seurat/integration/tests/main.nf.test index eac29227..8fac539d 100644 --- a/modules/local/seurat/integration/tests/main.nf.test +++ b/modules/local/seurat/integration/tests/main.nf.test @@ -8,8 +8,8 @@ nextflow_process { tag "modules_local" setup { - run("SCANPY_HVGS") { - script "modules/local/scanpy/hvgs/main.nf" + run("SCANPY_FILTER", alias: "QC_FILTER") { + script "modules/local/scanpy/filter/main.nf" process { """ input[0] = channel.of([ @@ -17,12 +17,29 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/combined_filtered_matrix.h5ad', checkIfExists: true) ] ) + input[1] = "index" + input[2] = 20 + input[3] = 20 + input[4] = 50 + input[5] = 50 + input[6] = 100 + 
input[7] = 0 + input[8] = 100 + input[9] = [] + """ + } + } + run("SCANPY_HVGS") { + script "modules/local/scanpy/hvgs/main.nf" + process { + """ + input[0] = QC_FILTER.out.h5ad input[1] = 100 input[2] = [] """ } } - run("SCANPY_FILTER") { + run("SCANPY_FILTER", alias: "HVG_FILTER") { script "modules/local/scanpy/filter/main.nf" process { """ @@ -49,7 +66,7 @@ nextflow_process { } process { """ - input[0] = SCANPY_FILTER.out.h5ad + input[0] = HVG_FILTER.out.h5ad input[1] = 'sample' """ } @@ -79,7 +96,7 @@ nextflow_process { } process { """ - input[0] = SCANPY_FILTER.out.h5ad + input[0] = HVG_FILTER.out.h5ad input[1] = 'sample' """ } diff --git a/modules/local/seurat/integration/tests/main.nf.test.snap b/modules/local/seurat/integration/tests/main.nf.test.snap index 1957b1a4..cb67cdc5 100644 --- a/modules/local/seurat/integration/tests/main.nf.test.snap +++ b/modules/local/seurat/integration/tests/main.nf.test.snap @@ -37,12 +37,12 @@ { "SEURAT_INTEGRATION": { "R": "4.5.3", - "Seurat": "5.4.0", + "Seurat": "5.5.0", "anndataR": "1.0.2" } }, { - "n_obs": 27350, + "n_obs": 12381, "n_vars": 100, "obs": { "index": "_index", @@ -99,10 +99,10 @@ ] } ], - "timestamp": "2026-04-12T20:15:34.917181", + "timestamp": "2026-05-31T08:44:01.475540778", "meta": { - "nf-test": "0.9.5", - "nextflow": "25.10.4" + "nf-test": "0.9.4", + "nextflow": "26.04.0" } } } \ No newline at end of file diff --git a/modules/local/symphony/harmonyintegrate/environment.yml b/modules/local/symphony/harmonyintegrate/environment.yml new file mode 100644 index 00000000..1e4070eb --- /dev/null +++ b/modules/local/symphony/harmonyintegrate/environment.yml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.13 + - conda-forge::pyyaml=6.0.3 + - conda-forge::scanpy=1.12.1 + - pip + - pip: + - symphonypy==0.2.4 diff --git a/modules/local/symphony/harmonyintegrate/main.nf b/modules/local/symphony/harmonyintegrate/main.nf new file mode 100644 index 
00000000..9a109b6a --- /dev/null +++ b/modules/local/symphony/harmonyintegrate/main.nf @@ -0,0 +1,37 @@ +process SYMPHONY_HARMONYINTEGRATE { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + // Apptainer runs Singularity images, so both engines use the pre-built image. + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/51/512121548a21b4d1bb8acfd5e30a75c5c2103ddd00cf1de4713c682b7e6b5387/data' + : 'community.wave.seqera.io/library/python_pyyaml_scanpy_pip_symphonypy:2198c27c5c9392d5'}" + + input: + tuple val(meta), path(h5ad) + val(batch_col) + val(counts_layer) + + output: + tuple val(meta), path("${prefix}.h5ad") , emit: h5ad + tuple val(meta), path("${prefix}_reference.h5ad"), emit: reference + path "X_${prefix}.pkl" , emit: obsm + path "versions.yml" , emit: versions, topic: versions + + script: + prefix = task.ext.prefix ?: "${meta.id}" + if ("${prefix}.h5ad" == "${h5ad}") { + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ } + template('harmonyintegrate.py') + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.h5ad + touch ${prefix}_reference.h5ad + touch X_${prefix}.pkl + touch versions.yml + """ +} diff --git a/modules/local/symphony/harmonyintegrate/templates/harmonyintegrate.py b/modules/local/symphony/harmonyintegrate/templates/harmonyintegrate.py new file mode 100644 index 00000000..1afb31d4 --- /dev/null +++ b/modules/local/symphony/harmonyintegrate/templates/harmonyintegrate.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import os + +os.environ["KMP_AFFINITY"] = "disabled" +os.environ["MPLCONFIGDIR"] = "./tmp/mpl" +os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba" + +import importlib.metadata +import platform + +import numpy as np +import pandas as pd +import scanpy as sc +import symphonypy as sp +import yaml +from anndata import AnnData +from scipy.sparse import csr_matrix +from threadpoolctl import threadpool_limits + + +def build_reference(adata, target_sum): + harmony = adata.uns["harmony"] + return AnnData( + X=csr_matrix((0, adata.n_vars), dtype=np.float32), + var=adata.var[["mean", "std", "highly_variable"]].copy(), + varm={"PCs": adata.varm["PCs"].copy()}, + uns={ + "harmony": { + "Nr": harmony["Nr"], + "C": harmony["C"], + "K": harmony["K"], + "sigma": harmony.get("sigma"), + "ref_basis_loadings": harmony["ref_basis_loadings"], + }, + "normalize": {"target_sum": target_sum}, + }, + ) + + +threadpool_limits(int("${task.cpus}")) + +adata = sc.read_h5ad("${h5ad}") +adata_proc = adata.copy() +prefix = "${prefix}" +batch_col = "${batch_col}" +counts_layer = "${counts_layer}" + +if counts_layer != "X": + adata_proc.X = adata_proc.layers[counts_layer] + +target_sum = float(np.median(np.asarray(adata_proc.X.sum(axis=1)).ravel())) +sc.pp.normalize_total(adata_proc, target_sum=target_sum) +sc.pp.log1p(adata_proc) +sc.pp.scale(adata_proc, max_value=10) +sc.pp.pca(adata_proc, n_comps=30, zero_center=False) +if "highly_variable" not in 
adata_proc.var.columns: + adata_proc.var["highly_variable"] = True + +sp.pp.harmony_integrate( + adata_proc, + key=batch_col, + flavor="python", + ref_basis_source="X_pca", + ref_basis_adjusted="X_symphony", +) + +adata_proc.uns["symphony"] = adata_proc.uns["harmony"] +adata_proc.uns["normalize"] = {"target_sum": target_sum} + +build_reference(adata_proc, target_sum).write_h5ad(f"{prefix}_reference.h5ad") + +adata.obsm["X_symphony"] = adata_proc.obsm["X_symphony"] +adata.obsm["X_emb"] = adata_proc.obsm["X_symphony"] +adata.write_h5ad(f"{prefix}.h5ad") + +pd.DataFrame(adata.obsm["X_emb"], index=adata.obs_names).to_pickle(f"X_{prefix}.pkl") + +versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": importlib.metadata.version("scanpy"), + "symphonypy": importlib.metadata.version("symphonypy"), + "pandas": pd.__version__, + } +} + +with open("versions.yml", "w") as f: + yaml.dump(versions, f) diff --git a/modules/local/scanpy/harmony/tests/main.nf.test b/modules/local/symphony/harmonyintegrate/tests/main.nf.test similarity index 72% rename from modules/local/scanpy/harmony/tests/main.nf.test rename to modules/local/symphony/harmonyintegrate/tests/main.nf.test index 8c6b2478..47825dc4 100644 --- a/modules/local/scanpy/harmony/tests/main.nf.test +++ b/modules/local/symphony/harmonyintegrate/tests/main.nf.test @@ -1,8 +1,8 @@ nextflow_process { - name "Test Process SCANPY_HARMONY" - script "modules/local/scanpy/harmony/main.nf" - process "SCANPY_HARMONY" + name "Test Process SYMPHONY_HARMONYINTEGRATE" + script "modules/local/symphony/harmonyintegrate/main.nf" + process "SYMPHONY_HARMONYINTEGRATE" tag "modules" tag "modules_local" @@ -28,11 +28,20 @@ nextflow_process { then { def adata = anndata(process.out.h5ad[0][1]) + def reference = anndata(process.out.reference[0][1]) assert process.success assert "X_emb" in adata.obsm + assert "harmony" in reference.uns + assert "normalize" in reference.uns + assert "mean" in reference.var.colnames + 
assert "std" in reference.var.colnames + assert "highly_variable" in reference.var.colnames + assert "PCs" in reference.varm + assert reference.n_obs == 0 assert snapshot( path(process.out.versions[0]).yaml, - adata.yaml + adata.yaml, + reference.yaml ).match() } diff --git a/modules/local/symphony/harmonyintegrate/tests/main.nf.test.snap b/modules/local/symphony/harmonyintegrate/tests/main.nf.test.snap new file mode 100644 index 00000000..06b31468 --- /dev/null +++ b/modules/local/symphony/harmonyintegrate/tests/main.nf.test.snap @@ -0,0 +1,146 @@ +{ + "Should run without failures - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_reference.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "X_test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "3": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "h5ad": [ + [ + { + "id": "test" + }, + "test.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "obsm": [ + "X_test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "reference": [ + [ + { + "id": "test" + }, + "test_reference.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + } + ], + "timestamp": "2026-05-28T13:28:41.939214142", + "meta": { + "nf-test": "0.9.4", + "nextflow": "26.04.0" + } + }, + "Should run without failures": { + "content": [ + { + "SYMPHONY_HARMONYINTEGRATE": { + "pandas": "2.3.3", + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" + } + }, + { + "n_obs": 38234, + "n_vars": 9887, + "obs": { + "index": "_index", + "columns": [ + "sample" + ] + }, + "var": { + "index": "_index", + "columns": [ + + ] + }, + "layers": [ + + ], + "obsm": [ + "X_emb", + "X_symphony" + ], + "varm": [ + + ], + "obsp": [ + + ], + "varp": [ + + ], + "uns": [ + + ] + }, + { + "n_obs": 0, + "n_vars": 9887, + "obs": { + "index": 
"_index", + "columns": [ + + ] + }, + "var": { + "index": "_index", + "columns": [ + "highly_variable", + "mean", + "std" + ] + }, + "layers": [ + + ], + "obsm": [ + + ], + "varm": [ + "PCs" + ], + "obsp": [ + + ], + "varp": [ + + ], + "uns": [ + "harmony", + "normalize" + ] + } + ], + "timestamp": "2026-05-28T14:41:42.365043934", + "meta": { + "nf-test": "0.9.4", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/local/symphony/mapembedding/environment.yml b/modules/local/symphony/mapembedding/environment.yml new file mode 100644 index 00000000..1e4070eb --- /dev/null +++ b/modules/local/symphony/mapembedding/environment.yml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.13 + - conda-forge::pyyaml=6.0.3 + - conda-forge::scanpy=1.12.1 + - pip + - pip: + - symphonypy==0.2.4 diff --git a/modules/local/scanpy/harmony/main.nf b/modules/local/symphony/mapembedding/main.nf similarity index 63% rename from modules/local/scanpy/harmony/main.nf rename to modules/local/symphony/mapembedding/main.nf index 315d1553..36bd70ce 100644 --- a/modules/local/scanpy/harmony/main.nf +++ b/modules/local/symphony/mapembedding/main.nf @@ -1,14 +1,16 @@ -process SCANPY_HARMONY { +process SYMPHONY_MAPEMBEDDING { tag "${meta.id}" label 'process_medium' conda "${moduleDir}/environment.yml" - container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container - ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/45/45339bf761a2cf0cdb058492bc37f3df8b05b363731d491d1d3a14e9ba0b8f55/data' - : 'community.wave.seqera.io/library/harmonypy_anndata_leidenalg_numpy_pruned:43066d5f86f18261'}" + // Apptainer runs Singularity images, so both engines use the pre-built image. + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/51/512121548a21b4d1bb8acfd5e30a75c5c2103ddd00cf1de4713c682b7e6b5387/data' + : 'community.wave.seqera.io/library/python_pyyaml_scanpy_pip_symphonypy:2198c27c5c9392d5'}" input: tuple val(meta), path(h5ad) + tuple val(meta2), path(reference_h5ad, stageAs: 'reference/reference.h5ad') val(batch_col) val(counts_layer) @@ -22,7 +24,7 @@ process SCANPY_HARMONY { if ("${prefix}.h5ad" == "${h5ad}") { error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" } - template('harmony.py') + template('map_embedding.py') stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/symphony/mapembedding/templates/map_embedding.py b/modules/local/symphony/mapembedding/templates/map_embedding.py new file mode 100644 index 00000000..4ade854e --- /dev/null +++ b/modules/local/symphony/mapembedding/templates/map_embedding.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import os + +os.environ["KMP_AFFINITY"] = "disabled" +os.environ["MPLCONFIGDIR"] = "./tmp/mpl" +os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba" + +import importlib.metadata +import platform + +import pandas as pd +import scanpy as sc +import symphonypy as sp +import yaml +from threadpoolctl import threadpool_limits + + +threadpool_limits(int("${task.cpus}")) + +adata = sc.read_h5ad("${h5ad}") +adata_proc = adata.copy() +adata_ref = sc.read_h5ad("reference/reference.h5ad") +prefix = "${prefix}" +batch_col = "${batch_col}" +counts_layer = "${counts_layer}" + +if counts_layer != "X": + adata_proc.X = adata_proc.layers[counts_layer] + +target_sum = float(adata_ref.uns["normalize"]["target_sum"]) +sc.pp.normalize_total(adata_proc, target_sum=target_sum) +sc.pp.log1p(adata_proc) + +sp.tl.map_embedding( + adata_proc, + adata_ref, + key=batch_col, + transferred_adjusted_basis="X_symphony", + use_genes_column="highly_variable", +) + +adata.obsm["X_symphony"] = adata_proc.obsm["X_symphony"] +adata.obsm["X_emb"] = 
adata_proc.obsm["X_symphony"] + +adata.write_h5ad(f"{prefix}.h5ad") +pd.DataFrame(adata.obsm["X_emb"], index=adata.obs_names).to_pickle(f"X_{prefix}.pkl") + +versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": importlib.metadata.version("scanpy"), + "symphonypy": importlib.metadata.version("symphonypy"), + "pandas": pd.__version__, + } +} + +with open("versions.yml", "w") as f: + yaml.dump(versions, f) diff --git a/modules/local/symphony/mapembedding/tests/main.nf.test b/modules/local/symphony/mapembedding/tests/main.nf.test new file mode 100644 index 00000000..8699ba98 --- /dev/null +++ b/modules/local/symphony/mapembedding/tests/main.nf.test @@ -0,0 +1,93 @@ +nextflow_process { + + name "Test Process SYMPHONY_MAPEMBEDDING" + script "modules/local/symphony/mapembedding/main.nf" + process "SYMPHONY_MAPEMBEDDING" + + tag "modules" + tag "modules_local" + + setup { + run("SYMPHONY_HARMONYINTEGRATE") { + script "modules/local/symphony/harmonyintegrate/main.nf" + process { + """ + input[0] = channel.of([ + [ id: 'symphony' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/combined_filtered_matrix.h5ad', checkIfExists: true) + ] + ) + input[1] = "sample" + input[2] = "X" + """ + } + } + } + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = channel.of([ + [ id: 'symphony' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/combined_filtered_matrix.h5ad', checkIfExists: true) + ] + ) + input[1] = SYMPHONY_HARMONYINTEGRATE.out.reference + input[2] = "sample" + input[3] = "X" + """ + } + } + + then { + def adata = anndata(process.out.h5ad[0][1]) + assert process.success + assert "X_emb" in adata.obsm + assert "X_symphony" in adata.obsm + assert snapshot( + path(process.out.versions[0]).yaml, + adata.yaml + ).match() + } + + } + + test("Should run without failures - stub") { + + options '-stub' + + when { + 
params { + outdir = "$outputDir" + } + process { + """ + input[0] = channel.of([ + [ id: 'symphony' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/combined_filtered_matrix.h5ad', checkIfExists: true) + ] + ) + input[1] = channel.of([ + [ id: 'symphony' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/h5ad/combined_filtered_matrix.h5ad', checkIfExists: true) + ] + ) + input[2] = "sample" + input[3] = "X" + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/modules/local/scanpy/harmony/tests/main.nf.test.snap b/modules/local/symphony/mapembedding/tests/main.nf.test.snap similarity index 69% rename from modules/local/scanpy/harmony/tests/main.nf.test.snap rename to modules/local/symphony/mapembedding/tests/main.nf.test.snap index 4a71e69d..5588ee0b 100644 --- a/modules/local/scanpy/harmony/tests/main.nf.test.snap +++ b/modules/local/symphony/mapembedding/tests/main.nf.test.snap @@ -5,13 +5,13 @@ "0": [ [ { - "id": "test" + "id": "symphony" }, - "test.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "1": [ - "X_test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], "2": [ "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" @@ -19,33 +19,33 @@ "h5ad": [ [ { - "id": "test" + "id": "symphony" }, - "test.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "obsm": [ - "X_test.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], "versions": [ "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-03-22T10:56:43.269700775", + "timestamp": "2026-05-28T16:31:40.664029252", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } }, "Should run without failures": { "content": [ { - "SCANPY_HARMONY": { - 
"harmonypy": "0.2.0", + "SYMPHONY_MAPEMBEDDING": { "pandas": "2.3.3", - "python": "3.13.12", - "scanpy": "1.12" + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" } }, { @@ -67,7 +67,8 @@ ], "obsm": [ - "X_emb" + "X_emb", + "X_symphony" ], "varm": [ @@ -83,10 +84,10 @@ ] } ], - "timestamp": "2026-03-29T11:17:47.094134151", + "timestamp": "2026-05-28T13:56:04.774526372", "meta": { "nf-test": "0.9.4", - "nextflow": "25.10.2" + "nextflow": "26.04.0" } } -} +} \ No newline at end of file diff --git a/modules/nf-core/cellbender/merge/templates/merge.py b/modules/nf-core/cellbender/merge/templates/merge.py index a15ada0d..8d424df5 100644 --- a/modules/nf-core/cellbender/merge/templates/merge.py +++ b/modules/nf-core/cellbender/merge/templates/merge.py @@ -30,11 +30,14 @@ def format_yaml_like(data: dict, indent: int = 0) -> str: adata_cellbender = load_anndata_from_input_and_output("${unfiltered}", "${cellbender_h5}", analyzed_barcodes_only=False) # Subset to the barcodes and genes present in the filtered matrix. -# Gene symbols (var index) may not be unique, so align on Ensembl IDs. -# The filtered h5ad uses 'gene_ids'; load_anndata_from_input_and_output uses 'gene_id'. -gene_id_col = "gene_id" if "gene_id" in adata_cellbender.var.columns else adata_cellbender.var.index.name -cb_id_to_pos = {gid: i for i, gid in enumerate(adata_cellbender.var[gene_id_col])} -var_positions = [cb_id_to_pos[gid] for gid in adata.var["gene_ids"]] +# Gene symbols (var index) may not be unique, so prefer Ensembl IDs when present. +# Column names differ: 10x/readh5 uses 'gene_ids'; unify/cellbender uses 'gene_id'. 
+filtered_gene_id_col = next((col for col in ("gene_ids", "gene_id") if col in adata.var.columns), None) +cellbender_gene_id_col = next((col for col in ("gene_id", "gene_ids") if col in adata_cellbender.var.columns), None) +filtered_ids = adata.var[filtered_gene_id_col] if filtered_gene_id_col else adata.var.index +cellbender_ids = adata_cellbender.var[cellbender_gene_id_col] if filtered_gene_id_col and cellbender_gene_id_col else adata_cellbender.var.index +cb_id_to_pos = {gid: i for i, gid in enumerate(cellbender_ids)} +var_positions = [cb_id_to_pos[gid] for gid in filtered_ids] adata_cellbender = adata_cellbender[adata.obs_names, var_positions] if "${output_layer}" == "X": diff --git a/nextflow.config b/nextflow.config index 1774ba60..1bfca071 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,6 +43,7 @@ params { scvi_model = null scanvi_model = null scimilarity_model = 'https://zenodo.org/records/10685499/files/model_v1.1.tar.gz' + symphony_reference = null expimap_gmt = null // Extension options @@ -112,7 +113,7 @@ params { help_full = false show_hidden = false version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/3ba0ba7174a5667fc2e005430594ffb063f986c7/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/97addfb0946c0e51dbb70ee1391142d12e70f085/' trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') // Config options config_profile_name = null diff --git a/nextflow_schema.json b/nextflow_schema.json index be1d18d2..43ed239b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -86,7 +86,7 @@ "species": { "type": "string", "default": "human", - "description": "Species of the input data. Used to auto-select bundled cell cycle gene lists (assets/cell_cycle_genes/_s_genes.txt and _g2m_genes.txt). Bundled lists are provided for 'human' and 'mouse'. Ignored when --s_genes and --g2m_genes are set explicitly." 
+ "description": "Species of the input data. Used to auto-select bundled cell cycle gene lists (assets/cell_cycle_genes/_s_genes.txt and _g2m_genes.txt) and as the MyGene.info taxonomy when converting gene identifiers (samplesheet symbol_col: none). Bundled cell cycle lists are provided for 'human' and 'mouse'. Ignored when --s_genes and --g2m_genes are set explicitly." }, "cell_cycle_scoring": { "type": "boolean", @@ -152,8 +152,8 @@ "type": "string", "default": "scvi", "description": "Specify the tool to use for integration", - "help_text": "If you want to use multiple tools, separate them with a comma. Available methods are: scvi, scanvi, harmony, bbknn, combat, seurat, scimilarity, pca, expimap", - "pattern": "^((scvi|scanvi|harmony|bbknn|combat|seurat|scimilarity|pca|expimap)(,(scvi|scanvi|harmony|bbknn|combat|seurat|scimilarity|pca|expimap))*)?$" + "help_text": "If you want to use multiple tools, separate them with a comma. Available methods are: scvi, scanvi, symphony, bbknn, combat, seurat, scimilarity, pca, expimap", + "pattern": "^((scvi|scanvi|symphony|bbknn|combat|seurat|scimilarity|pca|expimap)(,(scvi|scanvi|symphony|bbknn|combat|seurat|scimilarity|pca|expimap))*)?$" }, "integration_hvgs": { "type": "integer", @@ -167,6 +167,14 @@ "description": "Optional file containing a list of gene symbols (one per line). If provided, these genes will be excluded from highly variable genes selection for integration.", "exists": true }, + "symphony_reference": { + "type": "string", + "format": "file-path", + "description": "Path to a Symphony reference AnnData, only relevant if Symphony is selected in `integration_methods`. If provided, query cells will be mapped onto this reference instead of running de novo Symphony integration.", + "help_text": "The file should be in the .h5ad format. 
It is produced by a prior de novo Symphony run as `{outdir}/combine/integrate/symphony/symphony_reference.h5ad` and contains the compact Symphony reference metadata required for query mapping. Required for Symphony reference mapping and when extending an atlas with `--base_adata`.", + "pattern": "^\\S+\\.h5ad$", + "exists": true + }, "scvi_model": { "type": "string", "format": "file-path", @@ -228,7 +236,7 @@ "type": "string", "description": "The keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`). Required if `input` is not provided - otherwise it is ignored.", "help_text": "If the `input` parameter is not provided (no new data to add), integration will not be performed. In order to be able to utilize existing integration results, you need to provide the keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`).", - "pattern": "^((scvi|scanvi|harmony|bbknn|combat|seurat)(,(scvi|scanvi|harmony|bbknn|combat|seurat))*)?$" + "pattern": "^((scvi|scanvi|symphony|bbknn|combat|seurat)(,(scvi|scanvi|symphony|bbknn|combat|seurat))*)?$" } } }, @@ -569,7 +577,7 @@ "type": "string", "fa_icon": "far fa-check-circle", "description": "Base URL or local path to location of pipeline test dataset files", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/3ba0ba7174a5667fc2e005430594ffb063f986c7/", + "default": "https://raw.githubusercontent.com/nictru/test-datasets/97addfb0946c0e51dbb70ee1391142d12e70f085/", "hidden": true }, "trace_report_suffix": { diff --git a/nf-test.config b/nf-test.config index 52203d1e..c56bd5d9 100644 --- a/nf-test.config +++ b/nf-test.config @@ -29,6 +29,7 @@ config { 'nf-test.config', 'tests/.nftignore', 'tests/nextflow.config', + 'tests/analysis_plan_extension.csv', ] // load the necessary plugins diff --git a/subworkflows/local/cluster/tests/main.nf.test b/subworkflows/local/cluster/tests/main.nf.test index 7bf52994..34d39524 100644 --- 
a/subworkflows/local/cluster/tests/main.nf.test +++ b/subworkflows/local/cluster/tests/main.nf.test @@ -292,7 +292,7 @@ nextflow_workflow { input[1] = false input[2] = true input[3] = '' - input[4] = [[integration: 'harmony', subset: null, resolution: null, analyses: null]] + input[4] = [[integration: 'symphony', subset: null, resolution: null, analyses: null]] input[5] = ['0.5', '1'] input[6] = 'sample' input[7] = 'X_scvi' diff --git a/subworkflows/local/combine/main.nf b/subworkflows/local/combine/main.nf index 1c150fb2..78c0d12a 100644 --- a/subworkflows/local/combine/main.nf +++ b/subworkflows/local/combine/main.nf @@ -17,6 +17,7 @@ workflow COMBINE { scvi_categorical_covariates // value: string scvi_continuous_covariates // value: string scimilarity_model // value: string + symphony_reference // value: string expimap_gmt // value: string condition_col // value: string scib // value: boolean @@ -52,6 +53,7 @@ workflow COMBINE { scvi_categorical_covariates, scvi_continuous_covariates, scimilarity_model, + symphony_reference, expimap_gmt, condition_col ) diff --git a/subworkflows/local/combine/tests/main.nf.test b/subworkflows/local/combine/tests/main.nf.test index 836d14d3..e8446503 100644 --- a/subworkflows/local/combine/tests/main.nf.test +++ b/subworkflows/local/combine/tests/main.nf.test @@ -32,8 +32,9 @@ nextflow_workflow { input[9] = '' input[10] = 'https://zenodo.org/records/10685499/files/model_v1.1.tar.gz' input[11] = null - input[12] = 'condition' - input[13] = false + input[12] = null + input[13] = 'condition' + input[14] = false """ } } @@ -68,8 +69,9 @@ nextflow_workflow { input[9] = '' input[10] = 'https://zenodo.org/records/10685499/files/model_v1.1.tar.gz' input[11] = null - input[12] = 'condition' - input[13] = false + input[12] = null + input[13] = 'condition' + input[14] = false """ } } diff --git a/subworkflows/local/integrate/main.nf b/subworkflows/local/integrate/main.nf index ebcce693..9d3d7e5c 100644 --- 
a/subworkflows/local/integrate/main.nf +++ b/subworkflows/local/integrate/main.nf @@ -2,7 +2,8 @@ include { SCANPY_HVGS } from '../../../modules/local/scanpy/hvgs' include { SCANPY_FILTER } from '../../../modules/local/scanpy/filter' include { SCVITOOLS_SCVI } from '../../../modules/local/scvitools/scvi' include { SCVITOOLS_SCANVI } from '../../../modules/local/scvitools/scanvi' -include { SCANPY_HARMONY } from '../../../modules/local/scanpy/harmony' +include { SYMPHONY_HARMONYINTEGRATE } from '../../../modules/local/symphony/harmonyintegrate' +include { SYMPHONY_MAPEMBEDDING } from '../../../modules/local/symphony/mapembedding' include { SCANPY_BBKNN } from '../../../modules/local/scanpy/bbknn' include { SCANPY_COMBAT } from '../../../modules/local/scanpy/combat' include { SCANPY_PCA } from '../../../modules/local/scanpy/pca' @@ -23,10 +24,12 @@ workflow INTEGRATE { scvi_categorical_covariates // list of string scvi_continuous_covariates // list of string scimilarity_model // path + symphony_reference // path expimap_gmt // path condition_col // string main: + ch_versions = channel.empty() ch_obs = channel.empty() ch_var = channel.empty() ch_obsm = channel.empty() @@ -40,6 +43,7 @@ workflow INTEGRATE { n_hvgs, excluded_genes ) + ch_versions = ch_versions.mix(SCANPY_HVGS.out.versions) ch_h5ad_hvg = SCANPY_HVGS.out.h5ad // See issue 215 @@ -59,6 +63,7 @@ workflow INTEGRATE { [] ) ch_h5ad_hvg = SCANPY_FILTER.out.h5ad + ch_versions = ch_versions.mix(SCANPY_FILTER.out.versions) } else { ch_h5ad_hvg = ch_h5ad @@ -68,6 +73,7 @@ workflow INTEGRATE { SEURAT_INTEGRATION ( ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'seurat'], h5ad] }, "batch" ) + ch_versions = ch_versions.mix(SEURAT_INTEGRATION.out.versions) ch_integrations = ch_integrations.mix(SEURAT_INTEGRATION.out.h5ad) } @@ -82,6 +88,7 @@ workflow INTEGRATE { scvi_categorical_covariates, scvi_continuous_covariates, ) + ch_versions = ch_versions.mix(SCVITOOLS_SCVI.out.versions) ch_integrations = 
ch_integrations.mix(SCVITOOLS_SCVI.out.h5ad) ch_obsm = ch_obsm.mix(SCVITOOLS_SCVI.out.obsm) } @@ -100,19 +107,34 @@ workflow INTEGRATE { scvi_categorical_covariates, scvi_continuous_covariates, ) + ch_versions = ch_versions.mix(SCVITOOLS_SCANVI.out.versions) ch_integrations = ch_integrations.mix(SCVITOOLS_SCANVI.out.h5ad) ch_obs = ch_obs.mix(SCVITOOLS_SCANVI.out.obs) ch_obsm = ch_obsm.mix(SCVITOOLS_SCANVI.out.obsm) } - if (methods.contains('harmony')) { - SCANPY_HARMONY ( - ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'harmony'], h5ad] }, - "batch", - "X" - ) - ch_integrations = ch_integrations.mix(SCANPY_HARMONY.out.h5ad) - ch_obsm = ch_obsm.mix(SCANPY_HARMONY.out.obsm) + if (methods.contains('symphony')) { + if (symphony_reference) { + SYMPHONY_MAPEMBEDDING ( + ch_h5ad.map { _meta, h5ad -> [[id: 'symphony'], h5ad] }, + channel.value([[id: 'symphony'], symphony_reference]), + "batch", + "X" + ) + ch_versions = ch_versions.mix(SYMPHONY_MAPEMBEDDING.out.versions) + ch_integrations = ch_integrations.mix(SYMPHONY_MAPEMBEDDING.out.h5ad) + ch_obsm = ch_obsm.mix(SYMPHONY_MAPEMBEDDING.out.obsm) + } + else { + SYMPHONY_HARMONYINTEGRATE ( + ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'symphony'], h5ad] }, + "batch", + "X" + ) + ch_versions = ch_versions.mix(SYMPHONY_HARMONYINTEGRATE.out.versions) + ch_integrations = ch_integrations.mix(SYMPHONY_HARMONYINTEGRATE.out.h5ad) + ch_obsm = ch_obsm.mix(SYMPHONY_HARMONYINTEGRATE.out.obsm) + } } if (methods.contains('bbknn')) { @@ -120,6 +142,7 @@ workflow INTEGRATE { ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'bbknn'], h5ad] }, "batch" ) + ch_versions = ch_versions.mix(SCANPY_BBKNN.out.versions) ch_integrations = ch_integrations.mix(SCANPY_BBKNN.out.h5ad) } @@ -128,6 +151,7 @@ workflow INTEGRATE { ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'combat'], h5ad] }, "batch" ) + ch_versions = ch_versions.mix(SCANPY_COMBAT.out.versions) ch_integrations = ch_integrations.mix(SCANPY_COMBAT.out.h5ad) ch_obsm = ch_obsm.mix(SCANPY_COMBAT.out.obsm) } @@ -137,6 
+161,7 @@ workflow INTEGRATE { ch_h5ad_hvg.map { _meta, h5ad -> [[id: 'pca'], h5ad] }, "X_emb" ) + ch_versions = ch_versions.mix(SCANPY_PCA.out.versions) ch_integrations = ch_integrations.mix(SCANPY_PCA.out.h5ad) ch_obsm = ch_obsm.mix(SCANPY_PCA.out.obsm) } @@ -150,6 +175,7 @@ workflow INTEGRATE { condition_col, "X" ) + ch_versions = ch_versions.mix(SCARCHES_EXPIMAP.out.versions) ch_integrations = ch_integrations.mix(SCARCHES_EXPIMAP.out.h5ad) ch_obsm = ch_obsm.mix(SCARCHES_EXPIMAP.out.obsm) } @@ -159,6 +185,7 @@ workflow INTEGRATE { ch_h5ad.map { _meta, h5ad -> [[id: 'scimilarity'], h5ad] }, scimilarity_model, ) + ch_versions = ch_versions.mix(SCIMILARITY.out.versions) ch_integrations = ch_integrations.mix(SCIMILARITY.out.integrations) ch_obs = ch_obs.mix(SCIMILARITY.out.obs) ch_obsm = ch_obsm.mix(SCIMILARITY.out.obsm) @@ -169,4 +196,5 @@ workflow INTEGRATE { obs = ch_obs // channel: [ pkl ] var = ch_var // channel: [ pkl ] obsm = ch_obsm // channel: [ pkl ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/integrate/tests/main.nf.test b/subworkflows/local/integrate/tests/main.nf.test index 9562659e..9ee96c5d 100644 --- a/subworkflows/local/integrate/tests/main.nf.test +++ b/subworkflows/local/integrate/tests/main.nf.test @@ -7,7 +7,7 @@ nextflow_workflow { tag "subworkflows" tag "subworkflows_local" - test("Should run without failures - harmony - stub") { + test("Should run without failures - symphony - stub") { options '-stub' @@ -24,14 +24,15 @@ nextflow_workflow { input[1] = false input[2] = 2000 input[3] = [] - input[4] = ['harmony'] + input[4] = ['symphony'] input[5] = null input[6] = null input[7] = [] input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -43,7 +44,7 @@ nextflow_workflow { } - test("Should run without failures - harmony") { + test("Should run without failures - symphony") { when { params { @@ -58,14 +59,15 @@ nextflow_workflow { 
input[1] = false input[2] = 2000 input[3] = [] - input[4] = ['harmony'] + input[4] = ['symphony'] input[5] = null input[6] = null input[7] = [] input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -74,6 +76,7 @@ nextflow_workflow { def adata = anndata(workflow.out.integrations[0][1]) assert workflow.success assert "X_emb" in adata.obsm + assert "X_symphony" in adata.obsm assert snapshot( workflow.out.versions, adata.yaml @@ -106,7 +109,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -140,7 +144,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -185,7 +190,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -219,7 +225,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -253,14 +260,15 @@ nextflow_workflow { input[1] = true input[2] = -1 input[3] = [] - input[4] = ['harmony'] + input[4] = ['symphony'] input[5] = null input[6] = null input[7] = [] input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -296,7 +304,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -330,7 +339,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'condition' + input[11] = null + input[12] = 'condition' """ } } @@ -371,7 +381,8 @@ nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'batch' + input[11] = null + input[12] = 'batch' """ } } @@ -405,7 +416,8 @@ 
nextflow_workflow { input[8] = [] input[9] = null input[10] = null - input[11] = 'batch' + input[11] = null + input[12] = 'batch' """ } } diff --git a/subworkflows/local/integrate/tests/main.nf.test.snap b/subworkflows/local/integrate/tests/main.nf.test.snap index 4feacdc8..14315dc9 100644 --- a/subworkflows/local/integrate/tests/main.nf.test.snap +++ b/subworkflows/local/integrate/tests/main.nf.test.snap @@ -1,49 +1,4 @@ { - "Should run without failures - harmony - stub": { - "content": [ - { - "0": [ - [ - { - "id": "harmony" - }, - "harmony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - - ], - "2": [ - - ], - "3": [ - "X_harmony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - "integrations": [ - [ - { - "id": "harmony" - }, - "harmony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "obs": [ - - ], - "obsm": [ - "X_harmony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - "var": [ - - ] - } - ], - "timestamp": "2026-05-20T20:47:09.819743733", - "meta": { - "nf-test": "0.9.4", - "nextflow": "26.04.0" - } - }, "Should run without failures - bbknn - stub": { "content": [ { @@ -63,6 +18,11 @@ ], "3": [ + ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ], "integrations": [ [ @@ -80,21 +40,30 @@ ], "var": [ + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-05-20T20:49:35.863158377", + "timestamp": "2026-03-25T15:47:40.215608271", "meta": { "nf-test": "0.9.4", - "nextflow": "26.04.0" + "nextflow": "25.10.2" } }, "Should run without failures - combat": { "content": [ - null, + [ + "versions.yml:md5,20020d8c9cf585aaa75dd5a14aa5d3ae", + "versions.yml:md5,a6c1e0a77e0d31423a9d77edba85127d", + "versions.yml:md5,d28b65c4c18c54e1abc34040b584b823" + ], { "n_obs": 12940, - 
"n_vars": 2077, + "n_vars": 2000, "obs": { "index": "_index", "columns": [ @@ -154,7 +123,7 @@ ] } ], - "timestamp": "2026-05-20T20:52:30.761335469", + "timestamp": "2026-05-28T14:03:08.524818368", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" @@ -180,6 +149,11 @@ "3": [ "X_pca_pca.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], "integrations": [ [ { @@ -196,24 +170,29 @@ ], "var": [ + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-05-20T20:53:05.651566126", + "timestamp": "2026-04-10T16:50:54.506012382", "meta": { "nf-test": "0.9.4", - "nextflow": "26.04.0" + "nextflow": "25.10.2" } }, - "Should run without failures - extension mode - stub": { + "Should run without failures - combat - stub": { "content": [ { "0": [ [ { - "id": "harmony" + "id": "combat" }, - "harmony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "combat.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "1": [ @@ -223,39 +202,53 @@ ], "3": [ - "X_harmony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "combat.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ], "integrations": [ [ { - "id": "harmony" + "id": "combat" }, - "harmony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "combat.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "obs": [ ], "obsm": [ - "X_harmony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "combat.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], "var": [ + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-05-20T20:52:48.773987405", + "timestamp": "2026-03-25T15:49:26.334091777", "meta": { "nf-test": "0.9.4", - "nextflow": "26.04.0" + "nextflow": "25.10.2" } }, - "Should run without failures - pca": { + "Should run without failures - bbknn": { "content": [ - null, + [ + "versions.yml:md5,20020d8c9cf585aaa75dd5a14aa5d3ae", + "versions.yml:md5,ccf730637c4c61a84ac4a002bf9832e0", + "versions.yml:md5,d28b65c4c18c54e1abc34040b584b823" + ], { "n_obs": 12940, - "n_vars": 2077, + "n_vars": 2000, "obs": { "index": "_index", "columns": [ @@ -295,39 +288,41 @@ "counts" ], "obsm": [ - "X_emb" + "X_pca" ], "varm": [ - "X_emb" + "PCs" ], "obsp": [ - + "connectivities", + "distances" ], "varp": [ ], "uns": [ - "X_emb", "hvg", - "log1p" + "log1p", + "neighbors", + "pca" ] } ], - "timestamp": "2026-05-20T20:53:47.047398518", + "timestamp": "2026-05-28T14:01:44.359301169", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" } }, - "Should run without failures - combat - stub": { + "Should run without failures - symphony - stub": { "content": [ { "0": [ [ { - "id": "combat" + "id": "symphony" }, - "combat.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "1": [ @@ -337,39 +332,53 @@ ], "3": [ - "combat.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ], "integrations": [ [ { - "id": "combat" + "id": "symphony" }, - "combat.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "obs": [ ], "obsm": [ - "combat.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], "var": [ + ], + "versions": [ + 
"versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e", + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-05-20T20:51:21.560122268", + "timestamp": "2026-05-28T16:31:59.971834646", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" } }, - "Should run without failures - harmony": { + "Should run without failures - symphony": { "content": [ - null, + [ + "versions.yml:md5,0941a4daea5c41d9e3259be11e9f2263", + "versions.yml:md5,20020d8c9cf585aaa75dd5a14aa5d3ae", + "versions.yml:md5,d28b65c4c18c54e1abc34040b584b823" + ], { "n_obs": 12940, - "n_vars": 2077, + "n_vars": 2000, "obs": { "index": "_index", "columns": [ @@ -409,7 +418,8 @@ "counts" ], "obsm": [ - "X_emb" + "X_emb", + "X_symphony" ], "varm": [ @@ -426,18 +436,73 @@ ] } ], - "timestamp": "2026-05-20T20:48:46.532961357", + "timestamp": "2026-05-28T16:32:53.466053531", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" } }, - "Should run without failures - bbknn": { + "Should run without failures - extension mode - stub": { + "content": [ + { + "0": [ + [ + { + "id": "symphony" + }, + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "integrations": [ + [ + { + "id": "symphony" + }, + "symphony.h5ad:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "obs": [ + + ], + "obsm": [ + "X_symphony.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "var": [ + + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + } + ], + "timestamp": "2026-05-28T16:35:48.236148467", + "meta": { + "nf-test": "0.9.4", + "nextflow": "26.04.0" + } + }, + "Should run without failures - pca": { "content": [ - null, + [ + "versions.yml:md5,20020d8c9cf585aaa75dd5a14aa5d3ae", + "versions.yml:md5,87a2cb96724430656d9c1276e91e0208", + 
"versions.yml:md5,d28b65c4c18c54e1abc34040b584b823" + ], { "n_obs": 12940, - "n_vars": 2077, + "n_vars": 2000, "obs": { "index": "_index", "columns": [ @@ -477,27 +542,25 @@ "counts" ], "obsm": [ - "X_pca" + "X_emb" ], "varm": [ - "PCs" + "X_emb" ], "obsp": [ - "connectivities", - "distances" + ], "varp": [ ], "uns": [ + "X_emb", "hvg", - "log1p", - "neighbors", - "pca" + "log1p" ] } ], - "timestamp": "2026-05-20T20:51:00.345519561", + "timestamp": "2026-05-28T14:04:36.10115423", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" @@ -505,7 +568,9 @@ }, "Should run without failures - expimap": { "content": [ - null, + [ + "versions.yml:md5,44d9a1bbfabdc0ecae8adc586c7c1b2d" + ], { "n_obs": 12940, "n_vars": 9887, @@ -541,10 +606,10 @@ ] } ], - "timestamp": "2026-05-20T20:55:15.288341932", + "timestamp": "2026-04-11T15:55:42.640248171", "meta": { "nf-test": "0.9.4", - "nextflow": "26.04.0" + "nextflow": "25.10.4" } }, "Should run without failures - expimap - stub": { @@ -567,6 +632,9 @@ "3": [ "X_expimap.pkl:md5,d41d8cd98f00b204e9800998ecf8427e" ], + "4": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" + ], "integrations": [ [ { @@ -583,13 +651,16 @@ ], "var": [ + ], + "versions": [ + "versions.yml:md5,d41d8cd98f00b204e9800998ecf8427e" ] } ], - "timestamp": "2026-05-20T20:54:13.304991035", + "timestamp": "2026-04-11T09:14:13.34716941", "meta": { "nf-test": "0.9.4", - "nextflow": "26.04.0" + "nextflow": "25.10.2" } } } \ No newline at end of file diff --git a/subworkflows/local/quality_control/main.nf b/subworkflows/local/quality_control/main.nf index 0aced122..07d07524 100644 --- a/subworkflows/local/quality_control/main.nf +++ b/subworkflows/local/quality_control/main.nf @@ -31,6 +31,7 @@ workflow QUALITY_CONTROL { cell_cycle_scoring // value: boolean s_genes // path: file or [] g2m_genes // path: file or [] + species // value: string main: ch_multiqc_files = channel.empty() @@ -117,7 +118,8 @@ workflow QUALITY_CONTROL { ch_h5ad, unify_gene_symbols, 
duplicate_var_resolution, - aggregate_isoforms + aggregate_isoforms, + species ) ch_multiqc_files = ch_multiqc_files.mix(UNIFY.out.multiqc_files) ch_h5ad = UNIFY.out.h5ad diff --git a/subworkflows/local/quality_control/tests/main.nf.test b/subworkflows/local/quality_control/tests/main.nf.test index d582d91a..914abd60 100644 --- a/subworkflows/local/quality_control/tests/main.nf.test +++ b/subworkflows/local/quality_control/tests/main.nf.test @@ -35,6 +35,7 @@ nextflow_workflow { input[12] = false input[13] = [] input[14] = [] + input[15] = 'human' """ } } @@ -79,6 +80,7 @@ nextflow_workflow { input[12] = false input[13] = [] input[14] = [] + input[15] = 'human' """ } } @@ -122,6 +124,7 @@ nextflow_workflow { input[12] = false input[13] = [] input[14] = [] + input[15] = 'human' """ } } @@ -178,6 +181,7 @@ nextflow_workflow { input[12] = false input[13] = [] input[14] = [] + input[15] = 'human' """ } } @@ -217,6 +221,7 @@ nextflow_workflow { input[12] = true input[13] = file("${projectDir}/assets/cell_cycle_genes/human_s_genes.txt") input[14] = file("${projectDir}/assets/cell_cycle_genes/human_g2m_genes.txt") + input[15] = 'human' """ } } diff --git a/subworkflows/local/unify/main.nf b/subworkflows/local/unify/main.nf index 631c79a4..250a6b95 100644 --- a/subworkflows/local/unify/main.nf +++ b/subworkflows/local/unify/main.nf @@ -11,6 +11,7 @@ workflow UNIFY { unify_gene_symbols // value: boolean duplicate_var_resolution // value: string aggregate_isoforms // value: boolean + species // value: string main: ch_multiqc_files = channel.empty() @@ -21,7 +22,8 @@ workflow UNIFY { } MYGENE ( - ch_h5ad.needs_symbol_conversion + ch_h5ad.needs_symbol_conversion, + species ) ch_h5ad = ch_h5ad.has_symbol_col.mix( MYGENE.out.h5ad.map { meta, h5ad -> [meta + [symbol_col: 'symbols'], h5ad] } diff --git a/subworkflows/local/unify/tests/main.nf.test b/subworkflows/local/unify/tests/main.nf.test index a5c5cf08..35a9eb14 100644 --- a/subworkflows/local/unify/tests/main.nf.test +++ 
b/subworkflows/local/unify/tests/main.nf.test @@ -25,6 +25,7 @@ nextflow_workflow { input[1] = false input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } @@ -52,6 +53,7 @@ nextflow_workflow { input[1] = false input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } @@ -93,6 +95,7 @@ nextflow_workflow { input[1] = false input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } @@ -120,6 +123,7 @@ nextflow_workflow { input[1] = false input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } @@ -161,6 +165,7 @@ nextflow_workflow { input[1] = true input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } @@ -188,6 +193,7 @@ nextflow_workflow { input[1] = true input[2] = 'sum' input[3] = false + input[4] = 'human' """ } } diff --git a/subworkflows/local/utils_nfcore_scdownstream_pipeline/main.nf b/subworkflows/local/utils_nfcore_scdownstream_pipeline/main.nf index 6d25a1e1..4c98fc5c 100644 --- a/subworkflows/local/utils_nfcore_scdownstream_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_scdownstream_pipeline/main.nf @@ -165,6 +165,7 @@ workflow PIPELINE_COMPLETION { def analysisPlanToList() { params.analysis_plan ? 
samplesheetToList(params.analysis_plan, "${projectDir}/assets/schema_analysis_plan.json") + .collect { row -> row[0] } : [[integration: null, subset: null, resolution: null, analyses: null]] } @@ -181,8 +182,8 @@ def validateInputParameters() { } def integration_methods = params.integration_methods.split(',').collect { it -> it.trim().toLowerCase() } - if (params.input && params.base_adata && (integration_methods - ['scvi', 'scanvi', 'scimilarity']).size() > 0) { - throw new Exception("Only scvi, scanvi and scimilarity integration methods are supported if base_adata is provided") + if (params.input && params.base_adata && (integration_methods - ['scvi', 'scanvi', 'scimilarity', 'symphony']).size() > 0) { + throw new Exception("Only scvi, scanvi, scimilarity and symphony integration methods are supported if base_adata is provided") } if (params.base_adata && 'scvi' in integration_methods && !params.scvi_model) { @@ -197,6 +198,10 @@ def validateInputParameters() { throw new Exception("If base_adata is provided and scimilarity is used as integration method, scimilarity_model must be provided.") } + if (params.base_adata && 'symphony' in integration_methods && !params.symphony_reference) { + throw new Exception("If base_adata is provided and symphony is used as integration method, symphony_reference must be provided.") + } + // Validate sample_n and sample_fraction parameters if (params.sample_n && params.sample_fraction) { throw new Exception("Both sample_n and sample_fraction are set. 
Please use only one of them.") diff --git a/tests/analysis_plan_extension.csv b/tests/analysis_plan_extension.csv new file mode 100644 index 00000000..dc39c197 --- /dev/null +++ b/tests/analysis_plan_extension.csv @@ -0,0 +1,2 @@ +integration,subset,resolution,analyses +scvi,global,0.5,"paga,liana,de" diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index eda1a9db..5fc73379 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -135,12 +135,6 @@ "python": "3.13.12", "scanpy": "1.12" }, - "SCANPY_HARMONY": { - "harmonypy": "0.2.0", - "pandas": "2.3.3", - "python": "3.13.12", - "scanpy": "1.12" - }, "SCANPY_HVGS": { "python": "3.13.12", "scanpy": "1.12" @@ -172,6 +166,12 @@ "SCVITOOLS_SOLO": { "scvi": "1.4.3" }, + "SYMPHONY_HARMONYINTEGRATE": { + "pandas": "2.3.3", + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" + }, "UMAP": { "pandas": "2.3.3", "python": "3.13.12", @@ -237,14 +237,6 @@ "cluster_dimred/combat/leiden/combat-global-0.5_leiden.png", "cluster_dimred/combat/leiden/combat-global-1.0_leiden.png", "cluster_dimred/combat/umap", - "cluster_dimred/harmony", - "cluster_dimred/harmony/entropy", - "cluster_dimred/harmony/entropy/harmony-global-0.5_entropy.png", - "cluster_dimred/harmony/entropy/harmony-global-1.0_entropy.png", - "cluster_dimred/harmony/leiden", - "cluster_dimred/harmony/leiden/harmony-global-0.5_leiden.png", - "cluster_dimred/harmony/leiden/harmony-global-1.0_leiden.png", - "cluster_dimred/harmony/umap", "cluster_dimred/scvi", "cluster_dimred/scvi/entropy", "cluster_dimred/scvi/entropy/scvi-global-0.5_entropy.png", @@ -253,15 +245,25 @@ "cluster_dimred/scvi/leiden/scvi-global-0.5_leiden.png", "cluster_dimred/scvi/leiden/scvi-global-1.0_leiden.png", "cluster_dimred/scvi/umap", + "cluster_dimred/symphony", + "cluster_dimred/symphony/entropy", + "cluster_dimred/symphony/entropy/symphony-global-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-global-1.0_entropy.png", + 
"cluster_dimred/symphony/leiden", + "cluster_dimred/symphony/leiden/symphony-global-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-global-1.0_leiden.png", + "cluster_dimred/symphony/umap", "combine", "combine/integrate", "combine/integrate/scib_metrics", "combine/integrate/scib_metrics/combat_metrics.tsv", - "combine/integrate/scib_metrics/harmony_metrics.tsv", "combine/integrate/scib_metrics/scvi_metrics.tsv", + "combine/integrate/scib_metrics/symphony_metrics.tsv", "combine/integrate/scvi", "combine/integrate/scvi/scvi_model", "combine/integrate/scvi/scvi_model/model.pt", + "combine/integrate/symphony", + "combine/integrate/symphony/symphony_reference.h5ad", "combine/merge", "combine/merge/upset_genes.png", "finalized", @@ -305,7 +307,7 @@ "qc-report.qmd:md5,13061014a897b3fbdafd6ea3212df0e0" ] ], - "timestamp": "2026-05-11T22:49:53.648324922", + "timestamp": "2026-05-28T17:09:18.020969587", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/tests/main_pipeline_build.nf.test b/tests/main_pipeline_build.nf.test index f07b1ebb..1932054a 100644 --- a/tests/main_pipeline_build.nf.test +++ b/tests/main_pipeline_build.nf.test @@ -9,7 +9,7 @@ nextflow_pipeline { when { params { input = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/samplesheet.csv' - integration_methods = 'scvi,harmony,bbknn,combat,seurat' + integration_methods = 'scvi,symphony,bbknn,combat,seurat' doublet_detection = 'scrublet,scdblfinder' celltypist_model = 'Adult_COVID19_PBMC' integration_hvgs = 500 diff --git a/tests/main_pipeline_build.nf.test.snap b/tests/main_pipeline_build.nf.test.snap index 02fca1ba..f7cb9baa 100644 --- a/tests/main_pipeline_build.nf.test.snap +++ b/tests/main_pipeline_build.nf.test.snap @@ -135,12 +135,6 @@ "python": "3.13.12", "scanpy": "1.12" }, - "SCANPY_HARMONY": { - "harmonypy": "0.2.0", - "pandas": "2.3.3", - "python": "3.13.12", - "scanpy": "1.12" - }, "SCANPY_HVGS": { "python": "3.13.12", "scanpy": "1.12" @@ -171,9 
+165,15 @@ }, "SEURAT_INTEGRATION": { "R": "4.5.3", - "Seurat": "5.4.0", + "Seurat": "5.5.0", "anndataR": "1.0.2" }, + "SYMPHONY_HARMONYINTEGRATE": { + "pandas": "2.3.3", + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" + }, "UMAP": { "pandas": "2.3.3", "python": "3.13.12", @@ -239,14 +239,6 @@ "cluster_dimred/combat/leiden/combat-global-0.5_leiden.png", "cluster_dimred/combat/leiden/combat-global-1.0_leiden.png", "cluster_dimred/combat/umap", - "cluster_dimred/harmony", - "cluster_dimred/harmony/entropy", - "cluster_dimred/harmony/entropy/harmony-global-0.5_entropy.png", - "cluster_dimred/harmony/entropy/harmony-global-1.0_entropy.png", - "cluster_dimred/harmony/leiden", - "cluster_dimred/harmony/leiden/harmony-global-0.5_leiden.png", - "cluster_dimred/harmony/leiden/harmony-global-1.0_leiden.png", - "cluster_dimred/harmony/umap", "cluster_dimred/scvi", "cluster_dimred/scvi/entropy", "cluster_dimred/scvi/entropy/scvi-global-0.5_entropy.png", @@ -263,16 +255,26 @@ "cluster_dimred/seurat/leiden/seurat-global-0.5_leiden.png", "cluster_dimred/seurat/leiden/seurat-global-1.0_leiden.png", "cluster_dimred/seurat/umap", + "cluster_dimred/symphony", + "cluster_dimred/symphony/entropy", + "cluster_dimred/symphony/entropy/symphony-global-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-global-1.0_entropy.png", + "cluster_dimred/symphony/leiden", + "cluster_dimred/symphony/leiden/symphony-global-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-global-1.0_leiden.png", + "cluster_dimred/symphony/umap", "combine", "combine/integrate", "combine/integrate/scib_metrics", "combine/integrate/scib_metrics/combat_metrics.tsv", - "combine/integrate/scib_metrics/harmony_metrics.tsv", "combine/integrate/scib_metrics/scvi_metrics.tsv", "combine/integrate/scib_metrics/seurat_metrics.tsv", + "combine/integrate/scib_metrics/symphony_metrics.tsv", "combine/integrate/scvi", "combine/integrate/scvi/scvi_model", 
"combine/integrate/scvi/scvi_model/model.pt", + "combine/integrate/symphony", + "combine/integrate/symphony/symphony_reference.h5ad", "combine/merge", "combine/merge/upset_genes.png", "finalized", @@ -316,7 +318,7 @@ "qc-report.qmd:md5,13061014a897b3fbdafd6ea3212df0e0" ] ], - "timestamp": "2026-05-11T23:09:11.341514791", + "timestamp": "2026-05-31T12:35:21.295737307", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/tests/main_pipeline_extend.nf.test b/tests/main_pipeline_extend.nf.test index 083adbb7..daa0dd57 100644 --- a/tests/main_pipeline_extend.nf.test +++ b/tests/main_pipeline_extend.nf.test @@ -4,18 +4,21 @@ nextflow_pipeline { script "main.nf" tag "pipeline" - test("Should perform scvi reference extension") { + test("Should perform scvi and symphony reference extension") { when { params { - input = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/samplesheet_single.csv' - integration_methods = 'scvi' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/e3a7f43eb802a090affac918026d2ba5dce8fcd5/' + input = pipelines_testdata_base_path + 'samplesheet_single.csv' + integration_methods = 'scvi,symphony' doublet_detection = 'scrublet,scdblfinder' celltypist_model = 'Adult_COVID19_PBMC' integration_hvgs = 500 outdir = "$outputDir" - scvi_model = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/extension_base/model.pt' - base_adata = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/extension_base/merged.h5ad' + scvi_model = pipelines_testdata_base_path + 'extension_base/model.pt' + symphony_reference = pipelines_testdata_base_path + 'extension_base/symphony_reference.h5ad' + base_adata = pipelines_testdata_base_path + 'extension_base/merged.h5ad' + analysis_plan = "${projectDir}/tests/analysis_plan_extension.csv" } } diff --git a/tests/main_pipeline_extend.nf.test.snap b/tests/main_pipeline_extend.nf.test.snap index 63c6d111..1a1be83b 100644 --- 
a/tests/main_pipeline_extend.nf.test.snap +++ b/tests/main_pipeline_extend.nf.test.snap @@ -1,5 +1,5 @@ { - "Should perform scvi reference extension": { + "Should perform scvi and symphony reference extension": { "content": [ { "ADATA_EXTEND": { @@ -144,6 +144,12 @@ "SCVITOOLS_SCVI": { "scvi": "1.4.3" }, + "SYMPHONY_MAPEMBEDDING": { + "pandas": "2.3.3", + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" + }, "UMAP": { "pandas": "2.3.3", "python": "3.13.12", @@ -178,15 +184,16 @@ "cluster_dimred/scvi", "cluster_dimred/scvi/entropy", "cluster_dimred/scvi/entropy/scvi-global-0.5_entropy.png", - "cluster_dimred/scvi/entropy/scvi-global-1.0_entropy.png", "cluster_dimred/scvi/leiden", "cluster_dimred/scvi/leiden/scvi-global-0.5_leiden.png", - "cluster_dimred/scvi/leiden/scvi-global-1.0_leiden.png", "cluster_dimred/scvi/umap", + "cluster_dimred/symphony", + "cluster_dimred/symphony/umap", "combine", "combine/integrate", "combine/integrate/scib_metrics", "combine/integrate/scib_metrics/scvi_metrics.tsv", + "combine/integrate/scib_metrics/symphony_metrics.tsv", "combine/integrate/scvi", "combine/integrate/scvi/scvi_model", "combine/integrate/scvi/scvi_model/model.pt", @@ -228,7 +235,7 @@ "qc-report.qmd:md5,13061014a897b3fbdafd6ea3212df0e0" ] ], - "timestamp": "2026-05-11T23:24:27.061703025", + "timestamp": "2026-06-02T08:01:50.281977805", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/tests/main_pipeline_reference_mapping.nf.test b/tests/main_pipeline_reference_mapping.nf.test index 9292b3b7..e7ccbe44 100644 --- a/tests/main_pipeline_reference_mapping.nf.test +++ b/tests/main_pipeline_reference_mapping.nf.test @@ -4,17 +4,19 @@ nextflow_pipeline { script "main.nf" tag "pipeline" - test("Should perform scvi reference mapping") { + test("Should perform scvi and symphony reference mapping") { when { params { - input = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/samplesheet_single.csv' - integration_methods = 'scvi' 
+ pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/e3a7f43eb802a090affac918026d2ba5dce8fcd5/' + input = pipelines_testdata_base_path + 'samplesheet_single.csv' + integration_methods = 'scvi,symphony' doublet_detection = 'scrublet,scdblfinder' celltypist_model = 'Adult_COVID19_PBMC' integration_hvgs = 500 outdir = "$outputDir" - scvi_model = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/extension_base/model.pt' + scvi_model = pipelines_testdata_base_path + 'extension_base/model.pt' + symphony_reference = pipelines_testdata_base_path + 'extension_base/symphony_reference.h5ad' } } diff --git a/tests/main_pipeline_reference_mapping.nf.test.snap b/tests/main_pipeline_reference_mapping.nf.test.snap index 5812c99c..e8795e97 100644 --- a/tests/main_pipeline_reference_mapping.nf.test.snap +++ b/tests/main_pipeline_reference_mapping.nf.test.snap @@ -1,5 +1,5 @@ { - "Should perform scvi reference mapping": { + "Should perform scvi and symphony reference mapping": { "content": [ { "ADATA_EXTEND": { @@ -144,6 +144,12 @@ "SCVITOOLS_SCVI": { "scvi": "1.4.3" }, + "SYMPHONY_MAPEMBEDDING": { + "pandas": "2.3.3", + "python": "3.13.13", + "scanpy": "1.12.1", + "symphonypy": "0.2.4" + }, "UMAP": { "pandas": "2.3.3", "python": "3.13.12", @@ -183,10 +189,19 @@ "cluster_dimred/scvi/leiden/scvi-global-0.5_leiden.png", "cluster_dimred/scvi/leiden/scvi-global-1.0_leiden.png", "cluster_dimred/scvi/umap", + "cluster_dimred/symphony", + "cluster_dimred/symphony/entropy", + "cluster_dimred/symphony/entropy/symphony-global-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-global-1.0_entropy.png", + "cluster_dimred/symphony/leiden", + "cluster_dimred/symphony/leiden/symphony-global-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-global-1.0_leiden.png", + "cluster_dimred/symphony/umap", "combine", "combine/integrate", "combine/integrate/scib_metrics", "combine/integrate/scib_metrics/scvi_metrics.tsv", + 
"combine/integrate/scib_metrics/symphony_metrics.tsv", "combine/integrate/scvi", "combine/integrate/scvi/scvi_model", "combine/integrate/scvi/scvi_model/model.pt", @@ -228,7 +243,7 @@ "qc-report.qmd:md5,13061014a897b3fbdafd6ea3212df0e0" ] ], - "timestamp": "2026-05-12T08:53:40.785735333", + "timestamp": "2026-05-29T14:31:02.484302876", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/tests/main_pipeline_sub.nf.test b/tests/main_pipeline_sub.nf.test index 8ede02e5..03758609 100644 --- a/tests/main_pipeline_sub.nf.test +++ b/tests/main_pipeline_sub.nf.test @@ -8,10 +8,11 @@ nextflow_pipeline { when { params { + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/e3a7f43eb802a090affac918026d2ba5dce8fcd5/' outdir = "$outputDir" input = null - base_adata = 'https://github.com/nf-core/test-datasets/raw/refs/heads/scdownstream/extension_base/merged.h5ad' - base_embeddings = 'combat,harmony,scvi' + base_adata = pipelines_testdata_base_path + 'extension_base/merged.h5ad' + base_embeddings = 'symphony' cluster_global = false cluster_per_label = true base_label_col = 'sample' diff --git a/tests/main_pipeline_sub.nf.test.snap b/tests/main_pipeline_sub.nf.test.snap index 36b85745..cb74498d 100644 --- a/tests/main_pipeline_sub.nf.test.snap +++ b/tests/main_pipeline_sub.nf.test.snap @@ -59,58 +59,24 @@ }, [ "adata", - "adata/combat.h5ad", - "adata/harmony.h5ad", - "adata/scvi.h5ad", + "adata/symphony.h5ad", "cluster_dimred", - "cluster_dimred/combat", - "cluster_dimred/combat/entropy", - "cluster_dimred/combat/entropy/combat-SRR28679756-0.5_entropy.png", - "cluster_dimred/combat/entropy/combat-SRR28679756-1.0_entropy.png", - "cluster_dimred/combat/entropy/combat-SRR28679757-0.5_entropy.png", - "cluster_dimred/combat/entropy/combat-SRR28679757-1.0_entropy.png", - "cluster_dimred/combat/entropy/combat-SRR28679758-0.5_entropy.png", - "cluster_dimred/combat/entropy/combat-SRR28679758-1.0_entropy.png", - 
"cluster_dimred/combat/leiden", - "cluster_dimred/combat/leiden/combat-SRR28679756-0.5_leiden.png", - "cluster_dimred/combat/leiden/combat-SRR28679756-1.0_leiden.png", - "cluster_dimred/combat/leiden/combat-SRR28679757-0.5_leiden.png", - "cluster_dimred/combat/leiden/combat-SRR28679757-1.0_leiden.png", - "cluster_dimred/combat/leiden/combat-SRR28679758-0.5_leiden.png", - "cluster_dimred/combat/leiden/combat-SRR28679758-1.0_leiden.png", - "cluster_dimred/combat/umap", - "cluster_dimred/harmony", - "cluster_dimred/harmony/entropy", - "cluster_dimred/harmony/entropy/harmony-SRR28679756-0.5_entropy.png", - "cluster_dimred/harmony/entropy/harmony-SRR28679756-1.0_entropy.png", - "cluster_dimred/harmony/entropy/harmony-SRR28679757-0.5_entropy.png", - "cluster_dimred/harmony/entropy/harmony-SRR28679757-1.0_entropy.png", - "cluster_dimred/harmony/entropy/harmony-SRR28679758-0.5_entropy.png", - "cluster_dimred/harmony/entropy/harmony-SRR28679758-1.0_entropy.png", - "cluster_dimred/harmony/leiden", - "cluster_dimred/harmony/leiden/harmony-SRR28679756-0.5_leiden.png", - "cluster_dimred/harmony/leiden/harmony-SRR28679756-1.0_leiden.png", - "cluster_dimred/harmony/leiden/harmony-SRR28679757-0.5_leiden.png", - "cluster_dimred/harmony/leiden/harmony-SRR28679757-1.0_leiden.png", - "cluster_dimred/harmony/leiden/harmony-SRR28679758-0.5_leiden.png", - "cluster_dimred/harmony/leiden/harmony-SRR28679758-1.0_leiden.png", - "cluster_dimred/harmony/umap", - "cluster_dimred/scvi", - "cluster_dimred/scvi/entropy", - "cluster_dimred/scvi/entropy/scvi-SRR28679756-0.5_entropy.png", - "cluster_dimred/scvi/entropy/scvi-SRR28679756-1.0_entropy.png", - "cluster_dimred/scvi/entropy/scvi-SRR28679757-0.5_entropy.png", - "cluster_dimred/scvi/entropy/scvi-SRR28679757-1.0_entropy.png", - "cluster_dimred/scvi/entropy/scvi-SRR28679758-0.5_entropy.png", - "cluster_dimred/scvi/entropy/scvi-SRR28679758-1.0_entropy.png", - "cluster_dimred/scvi/leiden", - 
"cluster_dimred/scvi/leiden/scvi-SRR28679756-0.5_leiden.png", - "cluster_dimred/scvi/leiden/scvi-SRR28679756-1.0_leiden.png", - "cluster_dimred/scvi/leiden/scvi-SRR28679757-0.5_leiden.png", - "cluster_dimred/scvi/leiden/scvi-SRR28679757-1.0_leiden.png", - "cluster_dimred/scvi/leiden/scvi-SRR28679758-0.5_leiden.png", - "cluster_dimred/scvi/leiden/scvi-SRR28679758-1.0_leiden.png", - "cluster_dimred/scvi/umap", + "cluster_dimred/symphony", + "cluster_dimred/symphony/entropy", + "cluster_dimred/symphony/entropy/symphony-SRR28679756-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-SRR28679756-1.0_entropy.png", + "cluster_dimred/symphony/entropy/symphony-SRR28679757-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-SRR28679757-1.0_entropy.png", + "cluster_dimred/symphony/entropy/symphony-SRR28679758-0.5_entropy.png", + "cluster_dimred/symphony/entropy/symphony-SRR28679758-1.0_entropy.png", + "cluster_dimred/symphony/leiden", + "cluster_dimred/symphony/leiden/symphony-SRR28679756-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-SRR28679756-1.0_leiden.png", + "cluster_dimred/symphony/leiden/symphony-SRR28679757-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-SRR28679757-1.0_leiden.png", + "cluster_dimred/symphony/leiden/symphony-SRR28679758-0.5_leiden.png", + "cluster_dimred/symphony/leiden/symphony-SRR28679758-1.0_leiden.png", + "cluster_dimred/symphony/umap", "finalized", "finalized/base.h5ad", "finalized/base.rds", @@ -145,7 +111,7 @@ "qc-report.qmd:md5,13061014a897b3fbdafd6ea3212df0e0" ] ], - "timestamp": "2026-05-11T23:50:17.469316064", + "timestamp": "2026-05-29T14:32:12.630005788", "meta": { "nf-test": "0.9.4", "nextflow": "26.04.0" diff --git a/tests/nextflow.config b/tests/nextflow.config index aa29d8ba..0ab28b9b 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -8,7 +8,7 @@ // Or any resources requirements params { modules_testdata_base_path = 
'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/97addfb0946c0e51dbb70ee1391142d12e70f085' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nictru/test-datasets/97addfb0946c0e51dbb70ee1391142d12e70f085/' // CyteType is slow (remote LLM API); module nf-tests cover it — keep off in pipeline/subworkflow nf-tests cytetype_study_context = '' } diff --git a/workflows/scdownstream.nf b/workflows/scdownstream.nf index 82579603..e8a7c0e2 100644 --- a/workflows/scdownstream.nf +++ b/workflows/scdownstream.nf @@ -44,6 +44,7 @@ workflow SCDOWNSTREAM { cell_cycle_scoring // value: boolean s_genes // path: file or [] g2m_genes // path: file or [] + species // value: string qc_only // value: boolean celldex_reference // value: string celltypist_model // value: string @@ -59,6 +60,7 @@ workflow SCDOWNSTREAM { scvi_categorical_covariates // value: string scvi_continuous_covariates // value: string scimilarity_model // value: string + symphony_reference // value: string expimap_gmt // value: string skip_liana // value: boolean skip_rankgenesgroups // value: boolean @@ -128,6 +130,7 @@ workflow SCDOWNSTREAM { cell_cycle_scoring, s_genes, g2m_genes, + species, ) ch_multiqc_files = ch_multiqc_files.mix(QUALITY_CONTROL.out.multiqc_files) ch_h5ad = QUALITY_CONTROL.out.h5ad @@ -178,6 +181,7 @@ workflow SCDOWNSTREAM { scvi_categorical_covariates, scvi_continuous_covariates, scimilarity_model, + symphony_reference, expimap_gmt, condition_col, scib,