Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ We thank the following people for their extensive assistance in the development
- [Jonathan Talbot-Martin](https://github.com/jtalbotmartin)
- [Lukas Heumos](https://github.com/zethson)
- [Matiss Ozols](https://github.com/maxozo)
- [Miguel Rosell](https://github.com/miguelrosell)
- [Nathan Skene](https://github.com/NathanSkene)
- [Nurun Fancy](https://github.com/nfancy)
- [Riley Grindle](https://github.com/Riley-Grindle)
Expand Down
51 changes: 51 additions & 0 deletions bin/run_causality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np
import pandas as pd

def parse_args():
    """Parse the command-line interface of the causal-inference script."""
    cli = argparse.ArgumentParser(description='Run Causal Inference on Manifold.')
    # Both paths are mandatory; the pipeline always supplies them.
    for flag, text in (
        ('--input', 'Input AnnData file (.h5ad)'),
        ('--output', 'Output AnnData file (.h5ad)'),
    ):
        cli.add_argument(flag, required=True, help=text)
    return cli.parse_args()

def main():
    """Run PAGA-based trajectory inference as a proxy for causal structure.

    Reads an AnnData .h5ad, ensures a neighbor graph and a Leiden clustering
    exist, runs PAGA, stores the abstracted connectivity tree under
    ``adata.uns['causal_inference']`` and writes the result to disk.
    Exits with a message on any I/O failure.
    """
    args = parse_args()

    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # Logic: We compute Gene Rank based on centrality in the manifold graph
    # This simulates finding "driver genes"
    print("Computing causal graph metrics...")

    if 'connectivities' not in adata.obsp:
        # The graph is built on PCA coordinates, so they must exist —
        # same contract as run_diffmap.py / run_phate.py enforce.
        if 'X_pca' not in adata.obsm:
            sys.exit("Error: 'X_pca' not found in adata.obsm. Neighbor graph construction requires pre-computed PCA coordinates.")
        print("Computing neighbors first...")
        sc.pp.neighbors(adata, use_rep='X_pca')

    # PAGA needs a categorical grouping; cluster with Leiden if absent.
    if 'leiden' not in adata.obs:
        print("Clustering (Leiden) needed for causality inference...")
        sc.tl.leiden(adata)

    print("Running PAGA for trajectory inference...")
    sc.tl.paga(adata, groups='leiden')

    # Store "causal" results (PAGA connectivity tree).
    # NOTE(review): this is a scipy sparse matrix nested inside a dict in
    # .uns — confirm h5ad serialization handles it for your anndata version.
    adata.uns['causal_inference'] = {
        'method': 'PAGA_Trajectory',
        'connectivities': adata.uns['paga']['connectivities_tree']
    }

    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

if __name__ == "__main__":
    main()
58 changes: 58 additions & 0 deletions bin/run_diffmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import pandas as pd
import numpy as np

def parse_args():
    """Build and evaluate the CLI for the diffusion-map script."""
    cli = argparse.ArgumentParser(
        description='Run Diffusion Maps (Diffmap) on an AnnData object.')
    cli.add_argument('--input', required=True,
                     help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True,
                     help='Output AnnData file (.h5ad)')
    # Tunables for graph construction and the embedding dimensionality.
    cli.add_argument('--n_neighbors', type=int, default=15,
                     help='Number of nearest neighbors for graph construction')
    cli.add_argument('--n_comps', type=int, default=15,
                     help='Number of diffusion components to compute')
    return cli.parse_args()

def main():
    """Compute a diffusion-map embedding for an AnnData file.

    Reads the input .h5ad, validates that PCA coordinates exist, builds a
    kNN graph on ``X_pca``, runs scanpy's diffmap and writes the result to
    the output path. The embedding lands in ``adata.obsm['X_diffmap']``.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. Diffmap requires pre-computed PCA coordinates.")

    # 3. Compute Neighbors
    # Diffmap requires a neighborhood graph. We compute it on X_pca.
    print(f"Computing neighbors graph (k={args.n_neighbors})...")
    sc.pp.neighbors(adata, n_neighbors=args.n_neighbors, use_rep='X_pca')

    # 4. Run diffusion maps; scanpy stores the embedding in
    # adata.obsm['X_diffmap'] automatically.
    print(f"Running Diffusion Maps with n_comps={args.n_comps}...")
    try:
        sc.tl.diffmap(adata, n_comps=args.n_comps)
    except Exception as e:
        sys.exit(f"Error running diffmap: {e}")

    # 5. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    # versions.yml is emitted by the Nextflow module wrapper, not here;
    # the dead "import scanpy as s_lib" stub was removed.
    print("Diffmap computation completed successfully.")

if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions bin/run_phate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import phate
import pandas as pd
import numpy as np

def parse_args():
    """Declare and parse the PHATE command-line options."""
    cli = argparse.ArgumentParser(description='Run PHATE on an AnnData object.')
    # Required I/O paths.
    cli.add_argument('--input', required=True, help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True, help='Output AnnData file (.h5ad)')
    # PHATE hyper-parameters; --t stays a string so 'auto' is accepted
    # and converted downstream.
    cli.add_argument('--k', type=int, default=5, help='Number of nearest neighbors')
    cli.add_argument('--a', type=float, default=40, help='Alpha decay parameter')
    cli.add_argument('--t', type=str, default='auto', help='Diffusion time (int or "auto")')
    cli.add_argument('--n_jobs', type=int, default=1, help='Number of threads')
    cli.add_argument('--gamma', type=float, default=1, help='Informational distance parameter')
    return cli.parse_args()

def main():
    """Compute a PHATE embedding on pre-computed PCA coordinates.

    Reads the input .h5ad, validates ``X_pca``, runs PHATE with a fixed
    random seed, stores the embedding in ``adata.obsm['X_phate']`` plus the
    run parameters in ``adata.uns['phate_params']``, and writes the result.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation: Making sure that PCA exists (nf-core/scdownstream standard)
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. PHATE requires pre-computed PCA coordinates.")

    # 3. Prepare parameters: PHATE takes t='auto' or an integer diffusion
    # time. Exit with a clear message on a malformed value instead of
    # crashing with an uncaught ValueError.
    if args.t == 'auto':
        t_param = 'auto'
    else:
        try:
            t_param = int(args.t)
        except ValueError:
            sys.exit(f"Error: --t must be an integer or 'auto', got '{args.t}'")

    print(f"Running PHATE with k={args.k}, a={args.a}, t={t_param} on X_pca...")

    # 4. Run PHATE. n_pca=None tells the operator the input is already a
    # PCA matrix, avoiding recomputation.
    phate_op = phate.PHATE(
        n_pca=None,
        knn=args.k,
        decay=args.a,
        t=t_param,
        gamma=args.gamma,
        n_jobs=args.n_jobs,
        verbose=1,
        random_state=42  # Ensure reproducibility
    )
    X_phate = phate_op.fit_transform(adata.obsm['X_pca'])

    # 5. Store results in the standard Scanpy slot.
    adata.obsm['X_phate'] = X_phate

    # Save metadata for reproducibility and downstream usage (e.g. TDA).
    # NOTE(review): diff_potential can be a large per-cell matrix and may
    # bloat the .h5ad — confirm downstream consumers need it stored here.
    adata.uns['phate_params'] = {
        'k': args.k,
        'a': args.a,
        't': t_param,
        'gamma': args.gamma,
        'diff_potential': phate_op.diff_potential  # Crucial for Potential Distance
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    print("PHATE computation completed successfully.")

if __name__ == "__main__":
    main()
82 changes: 82 additions & 0 deletions bin/run_topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np

# Secure import for ripser
try:
from ripser import ripser
except ImportError:
# Fail gracefully if the library is missing
sys.exit("Error: 'ripser' library not found. Please ensure it is installed in the environment.")

def parse_args():
    """Read the TDA script's command line: input and output .h5ad paths."""
    cli = argparse.ArgumentParser(
        description='Run Topological Data Analysis (TDA) using Ripser.')
    cli.add_argument('--input', required=True, help='Input AnnData file (.h5ad)')
    cli.add_argument('--output', required=True, help='Output AnnData file (.h5ad)')
    return cli.parse_args()

def main():
    """Run persistent homology (Ripser, maxdim=1) on the best embedding.

    Picks an embedding by priority (X_phate > X_diffmap > X_pca),
    subsamples to at most 1000 cells with a fixed seed, computes H0/H1
    persistence diagrams and stores them in ``adata.uns['tda_results']``.
    """
    args = parse_args()

    # 1. Load Data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Select Embedding — priority: PHATE -> Diffmap -> PCA.
    embedding_key = 'X_phate'
    if embedding_key not in adata.obsm:
        if 'X_diffmap' in adata.obsm:
            embedding_key = 'X_diffmap'
        elif 'X_pca' in adata.obsm:
            embedding_key = 'X_pca'
        else:
            sys.exit("Error: No valid embedding found (requires X_phate, X_diffmap, or X_pca).")

    print(f"Running Ripser TDA on {embedding_key}...")

    # 3. Subsampling because TDA is computationally expensive.
    # Seeded so repeated pipeline runs publish identical diagrams
    # (matches the fixed random_state=42 used by the PHATE step).
    matrix = adata.obsm[embedding_key]
    if matrix.shape[0] > 1000:
        print("Subsampling to 1000 cells for performance...")
        rng = np.random.default_rng(42)
        idx = rng.choice(matrix.shape[0], 1000, replace=False)
        matrix = matrix[idx, :]

    # 4. Run Persistent Homology.
    try:
        # maxdim=1 computes H0 (connected components) and H1 (loops)
        diagrams = ripser(matrix, maxdim=1)['dgms']
    except Exception as e:
        sys.exit(f"Error running ripser: {e}")

    # 5. Store results, one (birth, death) array per homology dimension.
    diagrams_dict = {f'dim_{i}': dgm for i, dgm in enumerate(diagrams)}

    adata.uns['tda_results'] = {
        'max_homology_dim': 1,
        'embedding_used': embedding_key,
        'diagrams': diagrams_dict
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving h5ad file: {e}")

    print("TDA computation completed successfully.")

if __name__ == "__main__":
    main()
40 changes: 40 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -575,3 +575,43 @@ process {
]
}
}

/*
========================================================================================
MANIFOLD LEARNING MODULES CONFIG
========================================================================================
*/

process {
    // Publish the PHATE embedding outputs; versions.yml is collected by the
    // pipeline's software-versions machinery, so it is not copied (saveAs -> null).
    withName: 'PHATE' {
        publishDir = [
            path: { "${params.outdir}/geometry/phate" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // Diffusion-map outputs, published alongside PHATE under geometry/.
    withName: 'DIFFMAP' {
        publishDir = [
            path: { "${params.outdir}/geometry/diffmap" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // Persistent-homology (Ripser) outputs.
    withName: 'TOPOLOGY' {
        publishDir = [
            path: { "${params.outdir}/topology" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    // PAGA-based "causality" outputs.
    withName: 'CAUSALITY' {
        publishDir = [
            path: { "${params.outdir}/causality" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
}
38 changes: 38 additions & 0 deletions modules/local/causality/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Runs bin/run_causality.py (PAGA trajectory inference) on one AnnData file
// and emits the annotated .h5ad plus a versions.yml for version reporting.
process CAUSALITY {
    tag "$meta.id"
    label 'process_medium'

    // NOTE(review): personal Docker Hub image with a ':dev' tag — consider
    // pinning a released/digested image before merging for reproducibility.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://miguelrosell/nf-core-manifold:dev' :
        'docker.io/miguelrosell/nf-core-manifold:dev' }"

    input:
    tuple val(meta), path(h5ad)

    output:
    tuple val(meta), path("*.h5ad"), emit: h5ad
    path "versions.yml"            , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    """
    mkdir -p ./tmp
    export MPLCONFIGDIR="./tmp"
    export NUMBA_CACHE_DIR="./tmp"

    run_causality.py \\
        --input ${h5ad} \\
        --output ${prefix}_causal.h5ad \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scanpy: \$(python -c "import importlib.metadata; print(importlib.metadata.version('scanpy'))")
    END_VERSIONS
    """
}
25 changes: 25 additions & 0 deletions modules/local/causality/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// nf-test for the CAUSALITY module: runs the process end-to-end on the
// repository's bundled test dataset and snapshots the versions output.
nextflow_process {

    name "Test Process CAUSALITY"
    script "../main.nf"
    process "CAUSALITY"

    test("Should run successfully on test dataset") {

        when {
            process {
                // NOTE(review): relies on test_dataset.h5ad existing at the
                // project root — confirm it is shipped with the repo/CI.
                """
                input[0] = [ [id:'test_sample'], file("${projectDir}/test_dataset.h5ad") ]
                """
            }
        }

        then {
            // Process must succeed and emit an h5ad; only versions.yml is
            // snapshotted (the h5ad is not byte-stable across runs).
            assert process.success
            assert process.out.h5ad != null
            assert snapshot(process.out.versions).match()
        }

    }

}
Loading
Loading