Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ We thank the following people for their extensive assistance in the development
- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)
- [Lukas Heumos](https://github.com/zethson)
- [Matiss Ozols](https://github.com/maxozo)
- [Miguel Rosell](https://github.com/miguelrosell)
- [Nathan Skene](https://github.com/NathanSkene)
- [Nurun Fancy](https://github.com/nfancy)
- [Riley Grindle](https://github.com/Riley-Grindle)
Expand Down
51 changes: 51 additions & 0 deletions bin/run_causality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np
import pandas as pd

def parse_args():
    """Parse the command-line interface of the causal-inference script."""
    cli = argparse.ArgumentParser(description='Run Causal Inference on Manifold.')
    # Both paths are mandatory; the pipeline always supplies them.
    for flag, text in (
        ('--input', 'Input AnnData file (.h5ad)'),
        ('--output', 'Output AnnData file (.h5ad)'),
    ):
        cli.add_argument(flag, required=True, help=text)
    return cli.parse_args()

def main():
    """Run PAGA-based trajectory inference as a proxy for causal structure.

    Reads an AnnData .h5ad, ensures a neighbor graph and a Leiden clustering
    exist, runs PAGA, stores the abstracted connectivity tree under
    ``adata.uns['causal_inference']`` and writes the result to disk.
    Exits with a message on any I/O failure.
    """
    args = parse_args()

    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # Logic: We compute Gene Rank based on centrality in the manifold graph
    # This simulates finding "driver genes"
    print("Computing causal graph metrics...")

    if 'connectivities' not in adata.obsp:
        # The graph is built on PCA coordinates, so they must exist —
        # same contract as run_diffmap.py / run_phate.py enforce.
        if 'X_pca' not in adata.obsm:
            sys.exit("Error: 'X_pca' not found in adata.obsm. Neighbor graph construction requires pre-computed PCA coordinates.")
        print("Computing neighbors first...")
        sc.pp.neighbors(adata, use_rep='X_pca')

    # PAGA needs a categorical grouping; cluster with Leiden if absent.
    if 'leiden' not in adata.obs:
        print("Clustering (Leiden) needed for causality inference...")
        sc.tl.leiden(adata)

    print("Running PAGA for trajectory inference...")
    sc.tl.paga(adata, groups='leiden')

    # Store "causal" results (PAGA connectivity tree).
    # NOTE(review): this is a scipy sparse matrix nested inside a dict in
    # .uns — confirm h5ad serialization handles it for your anndata version.
    adata.uns['causal_inference'] = {
        'method': 'PAGA_Trajectory',
        'connectivities': adata.uns['paga']['connectivities_tree']
    }

    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

if __name__ == "__main__":
    main()
58 changes: 58 additions & 0 deletions bin/run_diffmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import pandas as pd
import numpy as np

def parse_args():
    """Build and evaluate the CLI for the diffusion-map script."""
    cli = argparse.ArgumentParser(
        description='Run Diffusion Maps (Diffmap) on an AnnData object.')
    cli.add_argument('--input', required=True,
                     help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True,
                     help='Output AnnData file (.h5ad)')
    # Tunables for graph construction and the embedding dimensionality.
    cli.add_argument('--n_neighbors', type=int, default=15,
                     help='Number of nearest neighbors for graph construction')
    cli.add_argument('--n_comps', type=int, default=15,
                     help='Number of diffusion components to compute')
    return cli.parse_args()

def main():
    """Compute a diffusion-map embedding for an AnnData file.

    Reads the input .h5ad, validates that PCA coordinates exist, builds a
    kNN graph on ``X_pca``, runs scanpy's diffmap and writes the result to
    the output path. The embedding lands in ``adata.obsm['X_diffmap']``.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. Diffmap requires pre-computed PCA coordinates.")

    # 3. Compute Neighbors
    # Diffmap requires a neighborhood graph. We compute it on X_pca.
    print(f"Computing neighbors graph (k={args.n_neighbors})...")
    sc.pp.neighbors(adata, n_neighbors=args.n_neighbors, use_rep='X_pca')

    # 4. Run diffusion maps; scanpy stores the embedding in
    # adata.obsm['X_diffmap'] automatically.
    print(f"Running Diffusion Maps with n_comps={args.n_comps}...")
    try:
        sc.tl.diffmap(adata, n_comps=args.n_comps)
    except Exception as e:
        sys.exit(f"Error running diffmap: {e}")

    # 5. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    # versions.yml is emitted by the Nextflow module wrapper, not here;
    # the dead "import scanpy as s_lib" stub was removed.
    print("Diffmap computation completed successfully.")

if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions bin/run_phate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import phate
import pandas as pd
import numpy as np

def parse_args():
    """Declare and parse the PHATE command-line options."""
    cli = argparse.ArgumentParser(description='Run PHATE on an AnnData object.')
    # Required I/O paths.
    cli.add_argument('--input', required=True, help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True, help='Output AnnData file (.h5ad)')
    # PHATE hyper-parameters; --t stays a string so 'auto' is accepted
    # and converted downstream.
    cli.add_argument('--k', type=int, default=5, help='Number of nearest neighbors')
    cli.add_argument('--a', type=float, default=40, help='Alpha decay parameter')
    cli.add_argument('--t', type=str, default='auto', help='Diffusion time (int or "auto")')
    cli.add_argument('--n_jobs', type=int, default=1, help='Number of threads')
    cli.add_argument('--gamma', type=float, default=1, help='Informational distance parameter')
    return cli.parse_args()

def main():
    """Compute a PHATE embedding on pre-computed PCA coordinates.

    Reads the input .h5ad, validates ``X_pca``, runs PHATE with a fixed
    random seed, stores the embedding in ``adata.obsm['X_phate']`` plus the
    run parameters in ``adata.uns['phate_params']``, and writes the result.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation: Making sure that PCA exists (nf-core/scdownstream standard)
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. PHATE requires pre-computed PCA coordinates.")

    # 3. Prepare parameters: PHATE takes t='auto' or an integer diffusion
    # time. Exit with a clear message on a malformed value instead of
    # crashing with an uncaught ValueError.
    if args.t == 'auto':
        t_param = 'auto'
    else:
        try:
            t_param = int(args.t)
        except ValueError:
            sys.exit(f"Error: --t must be an integer or 'auto', got '{args.t}'")

    print(f"Running PHATE with k={args.k}, a={args.a}, t={t_param} on X_pca...")

    # 4. Run PHATE. n_pca=None tells the operator the input is already a
    # PCA matrix, avoiding recomputation.
    phate_op = phate.PHATE(
        n_pca=None,
        knn=args.k,
        decay=args.a,
        t=t_param,
        gamma=args.gamma,
        n_jobs=args.n_jobs,
        verbose=1,
        random_state=42  # Ensure reproducibility
    )
    X_phate = phate_op.fit_transform(adata.obsm['X_pca'])

    # 5. Store results in the standard Scanpy slot.
    adata.obsm['X_phate'] = X_phate

    # Save metadata for reproducibility and downstream usage (e.g. TDA).
    # NOTE(review): diff_potential can be a large per-cell matrix and may
    # bloat the .h5ad — confirm downstream consumers need it stored here.
    adata.uns['phate_params'] = {
        'k': args.k,
        'a': args.a,
        't': t_param,
        'gamma': args.gamma,
        'diff_potential': phate_op.diff_potential  # Crucial for Potential Distance
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    print("PHATE computation completed successfully.")

if __name__ == "__main__":
    main()
82 changes: 82 additions & 0 deletions bin/run_topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np

# Secure import for ripser
try:
from ripser import ripser
except ImportError:
# Fail gracefully if the library is missing
sys.exit("Error: 'ripser' library not found. Please ensure it is installed in the environment.")

def parse_args():
    """Read the TDA script's command line: input and output .h5ad paths."""
    cli = argparse.ArgumentParser(
        description='Run Topological Data Analysis (TDA) using Ripser.')
    cli.add_argument('--input', required=True, help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True, help='Output AnnData file (.h5ad)')
    return cli.parse_args()

def main():
    """Run persistent homology (Ripser, maxdim=1) on the best embedding.

    Picks an embedding by priority (X_phate > X_diffmap > X_pca),
    subsamples to at most 1000 cells with a fixed seed, computes H0/H1
    persistence diagrams and stores them in ``adata.uns['tda_results']``.
    """
    args = parse_args()

    # 1. Load Data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Select Embedding — priority: PHATE -> Diffmap -> PCA.
    embedding_key = 'X_phate'
    if embedding_key not in adata.obsm:
        if 'X_diffmap' in adata.obsm:
            embedding_key = 'X_diffmap'
        elif 'X_pca' in adata.obsm:
            embedding_key = 'X_pca'
        else:
            sys.exit("Error: No valid embedding found (requires X_phate, X_diffmap, or X_pca).")

    print(f"Running Ripser TDA on {embedding_key}...")

    # 3. Subsampling because TDA is computationally expensive.
    # Seeded so repeated pipeline runs publish identical diagrams
    # (matches the fixed random_state=42 used by the PHATE step).
    matrix = adata.obsm[embedding_key]
    if matrix.shape[0] > 1000:
        print("Subsampling to 1000 cells for performance...")
        rng = np.random.default_rng(42)
        idx = rng.choice(matrix.shape[0], 1000, replace=False)
        matrix = matrix[idx, :]

    # 4. Run Persistent Homology.
    try:
        # maxdim=1 computes H0 (connected components) and H1 (loops)
        diagrams = ripser(matrix, maxdim=1)['dgms']
    except Exception as e:
        sys.exit(f"Error running ripser: {e}")

    # 5. Store results, one (birth, death) array per homology dimension.
    diagrams_dict = {f'dim_{i}': dgm for i, dgm in enumerate(diagrams)}

    adata.uns['tda_results'] = {
        'max_homology_dim': 1,
        'embedding_used': embedding_key,
        'diagrams': diagrams_dict
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving h5ad file: {e}")

    print("TDA computation completed successfully.")

if __name__ == "__main__":
    main()
40 changes: 40 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -575,3 +575,43 @@ process {
]
}
}

/*
========================================================================================
MANIFOLD LEARNING MODULES CONFIG
========================================================================================
*/

process {
    // Publish the PHATE embedding outputs; versions.yml is collected by the
    // pipeline's software-versions machinery, so it is not copied (saveAs -> null).
    withName: 'PHATE' {
        publishDir = [
            path: { "${params.outdir}/geometry/phate" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // Diffusion-map outputs, published alongside PHATE under geometry/.
    withName: 'DIFFMAP' {
        publishDir = [
            path: { "${params.outdir}/geometry/diffmap" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // Persistent-homology (Ripser) outputs.
    withName: 'TOPOLOGY' {
        publishDir = [
            path: { "${params.outdir}/topology" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // PAGA-based "causality" outputs.
    withName: 'CAUSALITY' {
        publishDir = [
            path: { "${params.outdir}/causality" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
}
38 changes: 38 additions & 0 deletions modules/local/causality/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Runs bin/run_causality.py (PAGA trajectory inference) on one AnnData file
// and emits the annotated .h5ad plus a versions.yml for version reporting.
process CAUSALITY {
    tag "$meta.id"
    label 'process_medium'

    // NOTE(review): personal Docker Hub image with a ':dev' tag — consider
    // pinning a released/digested image before merging for reproducibility.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://miguelrosell/nf-core-manifold:dev' :
        'docker.io/miguelrosell/nf-core-manifold:dev' }"

    input:
    tuple val(meta), path(h5ad)

    output:
    tuple val(meta), path("*.h5ad"), emit: h5ad
    path "versions.yml"            , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    """
    mkdir -p ./tmp
    export MPLCONFIGDIR="./tmp"
    export NUMBA_CACHE_DIR="./tmp"

    run_causality.py \\
        --input ${h5ad} \\
        --output ${prefix}_causal.h5ad \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scanpy: \$(python -c "import importlib.metadata; print(importlib.metadata.version('scanpy'))")
    END_VERSIONS
    """
}
25 changes: 25 additions & 0 deletions modules/local/causality/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// nf-test for the CAUSALITY module: runs the process end-to-end on the
// repository's bundled test dataset and snapshots the versions output.
nextflow_process {

    name "Test Process CAUSALITY"
    script "../main.nf"
    process "CAUSALITY"

    test("Should run successfully on test dataset") {

        when {
            process {
                // NOTE(review): relies on test_dataset.h5ad existing at the
                // project root — confirm it is shipped with the repo/CI.
                """
                input[0] = [ [id:'test_sample'], file("${projectDir}/test_dataset.h5ad") ]
                """
            }
        }

        then {
            // Process must succeed and emit an h5ad; only versions.yml is
            // snapshotted (the h5ad is not byte-stable across runs).
            assert process.success
            assert process.out.h5ad != null
            assert snapshot(process.out.versions).match()
        }

    }

}
Loading
Loading