Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9abde15
New modules: cluster_metrics + cluster_viz
dbaku42 Apr 29, 2026
0320a41
Move custom clustering modules under custom
dbaku42 May 4, 2026
e424338
Move custom clustering modules under custom
dbaku42 May 4, 2026
670bc33
Fix custom clustering module lint
dbaku42 May 4, 2026
3a26119
Fix custom clustering module lint and snapshots
dbaku42 May 4, 2026
e4877e4
Address review comments for clustering custom modules
dbaku42 May 6, 2026
7e19b22
Fix custom clustering module metadata
dbaku42 May 6, 2026
fdb1be4
Add Dockerfile for custom/clustermetrics and custom/clustervisualiation
dbaku42 May 7, 2026
d3a382b
Add container directive for custom/clustermetrics and clustervisualia…
dbaku42 May 7, 2026
89287db
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
d5ac668
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
55e868a
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
e492edb
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
b8529f6
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
12bb00d
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
a4a01a3
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
060eb5d
fix: use template for cluster visualization module
dbaku42 May 8, 2026
d48a224
style: clean cluster visualization module main
dbaku42 May 8, 2026
2d85309
fix: address reviewer feedback for cluster modules
dbaku42 May 8, 2026
387cbeb
fix: address pinin4fjords follow-up review - template escaping, drop …
dbaku42 May 11, 2026
4ed380b
feat(custom/clustervisualization): add UMAP and t-SNE cluster visuali…
dbaku42 May 11, 2026
4fe91e7
fix: apply ruff formatting to cluster_viz.py template
dbaku42 May 11, 2026
aaded95
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 11, 2026
0980208
fix: align clustermetrics and clustervisualization envs and containers
dbaku42 May 11, 2026
f240623
fix: use docker:// prefix for singularity container to enable OCI con…
dbaku42 May 12, 2026
c46c26c
fix(custom/clustervisualization): set NUMBA_CACHE_DIR and MPLCONFIGDI…
dbaku42 May 12, 2026
ad39971
fix(custom/clustervisualization): move NUMBA_CACHE_DIR fix before any…
dbaku42 May 12, 2026
c8fec15
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
3634466
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
0ca17c9
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 12, 2026
6a7ac45
Prek and script fix
dbaku42 May 12, 2026
cddb5a8
Fixed pandas series problem in cluster_metrics.py
dbaku42 May 13, 2026
4a72291
fix: escape \n in f-strings for Groovy template compatibility
dbaku42 May 13, 2026
2bf8afa
Format CUSTOM_CLUSTERMETRICS template with ruff
dbaku42 May 13, 2026
fd15709
fix(clustermetrics,clustervisualization): update nf-test snapshots an…
dbaku42 May 13, 2026
792373b
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 13, 2026
6114ebd
fix environment conflict
dbaku42 May 13, 2026
8348609
feat: add custom clustering and metrics modules
dbaku42 May 13, 2026
b33105d
chore: remove extra custom modules from clustering PR
dbaku42 May 13, 2026
fdf5db0
Merge branch 'master' into add-custom-clustering
dbaku42 May 13, 2026
280aac9
test: update clustering snapshot after pyyaml addition
dbaku42 May 13, 2026
9d98375
Add pyyaml version 6.0.2 to environment.yml
dbaku42 May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,14 @@ test_output/
tests/data/
work/
.github/CODEOWNERS-tmp
modules/local/
vcf_data/
subworkflows/nf-core/snpclustering/modules/
subworkflows/nf-core/snpclustering/run.log
subworkflows/nf-core/snpclustering/run_test.nf
subworkflows/nf-core/snpclustering/test_local.nf
subworkflows/nf-core/snpclustering/scripts/
subworkflows/nf-core/snpclustering/Dockerfile
subworkflows/nf-core/snpclustering/tests/
subworkflows/nf-core/snpclustering/main.nf
modules/nf-core/clustering/
15 changes: 15 additions & 0 deletions modules/nf-core/custom/clustering/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# clustering/environment.yml
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- conda-forge::matplotlib=3.9.4
- conda-forge::numpy=2.4.2
- conda-forge::pandas=2.2.3
- conda-forge::python=3.12.12
- conda-forge::scikit-learn=1.6.1
- conda-forge::seaborn=0.13.2
- conda-forge::umap-learn=0.5.12
- conda-forge::pyyaml=6.0.2
40 changes: 40 additions & 0 deletions modules/nf-core/custom/clustering/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Cluster samples on principal components produced by PLINK2 --pca,
// using either KMeans or DBSCAN (selected via the `algorithm` input).
// The actual work happens in the Groovy-templated script templates/clustering.py.
process CUSTOM_CLUSTERING {
    tag "$meta.id"
    label 'process_medium'
    conda "${moduleDir}/environment.yml"
    // Same Wave image for every engine; the docker:// prefix lets
    // Singularity/Apptainer pull the OCI image directly.
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' :
        'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }"

    input:
    tuple val(meta), path(eigenvec)   // sample meta map + PLINK2 .eigenvec file
    val algorithm                     // 'kmeans' or 'dbscan' (anything else falls through to DBSCAN in the template)
    val n_clusters                    // k for KMeans
    val dbscan_eps                    // eps for DBSCAN
    val dbscan_min_samples            // min_samples for DBSCAN

    output:
    tuple val(meta), path("*_clusters.csv")         , emit: clusters
    tuple val(meta), path("*_clustering_info.json") , emit: info, optional: true
    path "versions.yml"                             , emit: versions, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // All inputs are interpolated into the Python template by the Groovy engine.
    template 'clustering.py'

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE(review): stub versions.yml omits numpy and uses ${task.process},
    // while the script template records numpy under a hardcoded
    // 'CUSTOM_CLUSTERING' key — confirm the nf-test snapshots expect this divergence.
    """
    touch ${prefix}_clusters.csv
    touch ${prefix}_clustering_info.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | sed 's/Python //')
        pandas: \$(python3 -c "import pandas; print(pandas.__version__)")
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
77 changes: 77 additions & 0 deletions modules/nf-core/custom/clustering/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: "CUSTOM_CLUSTERING"
description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2
--pca"
keywords:
- clustering
- pca
- kmeans
- dbscan
- principal-components
tools:
- "scikit-learn":
description: "Machine learning library for clustering"
homepage: "https://scikit-learn.org/"
documentation: "https://scikit-learn.org/stable/modules/clustering.html"
licence:
- "BSD-3-Clause"
identifier: ""
input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- eigenvec:
type: file
description: PLINK2 .eigenvec file generated by --pca
pattern: "*.eigenvec"
ontologies: []
- algorithm:
type: string
description: Clustering algorithm to use (kmeans or dbscan)
- n_clusters:
type: integer
description: Number of clusters for KMeans
- dbscan_eps:
type: float
description: Epsilon parameter for DBSCAN
- dbscan_min_samples:
type: integer
description: Minimum samples parameter for DBSCAN
output:
clusters:
- - meta:
type: map
description: Groovy Map containing sample information
- "*_clusters.csv":
type: file
description: CSV file with sample_id and assigned cluster
pattern: "*_clusters.csv"
ontologies:
- edam: http://edamontology.org/format_3752
info:
- - meta:
type: map
description: Groovy Map containing sample information
- "*_clustering_info.json":
type: file
description: JSON file with clustering parameters and statistics
pattern: "*_clustering_info.json"
ontologies:
- edam: http://edamontology.org/format_3464
versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"
ontologies:
- edam: http://edamontology.org/format_3750
topics:
versions:
- versions.yml:
type: string
description: The name of the process
authors:
- "@dbaku42"
maintainers:
- "@dbaku42"
184 changes: 184 additions & 0 deletions modules/nf-core/custom/clustering/templates/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env python3

import json
import platform
import sklearn
import yaml
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN

PC_COL_RE = re.compile('[Pp][Cc][0-9]+', re.IGNORECASE)


def convert_eigenvec_to_tsv(eigenvec_path, out_pca, id_mode='iid'):
    """Convert a PLINK2 .eigenvec file to a tab-separated PCA score table.

    Supports both header styles emitted by PLINK2: '#FID IID PC1 ...' and
    '#IID PC1 ...'. For headerless files the layout is guessed from the
    first data row: if the second column parses as a float the file is
    treated as IID-only, otherwise as FID+IID.

    Note: the '\\t' / '\\n' sequences below are rendered to literal tab and
    newline escapes by the Nextflow/Groovy template engine.

    Args:
        eigenvec_path: pathlib.Path to the input .eigenvec file.
        out_pca: pathlib.Path of the TSV to write (columns: sample_id, PC1..PCn).
        id_mode: 'iid' keeps the IID alone; any other value joins 'FID:IID'
            (only relevant in FID+IID mode).

    Returns:
        The number of PC columns detected (taken from the first data row).

    Raises:
        ValueError: if the layout cannot be determined or no data rows exist.
    """
    rows = []
    n_pcs = 0
    mode = None

    with eigenvec_path.open('r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if parts[0].startswith('#'):
                # Header line: detect the column layout from the leading names.
                header = [p.lstrip('#') for p in parts]
                if len(header) >= 2 and header[0].upper() == 'FID' and header[1].upper() == 'IID':
                    mode = 'fid_iid'
                elif header[0].upper() == 'IID':
                    mode = 'iid_only'
                continue
            if mode is None:
                # Headerless file: if column 2 is numeric it must be PC1,
                # so the first column is the IID; otherwise assume FID+IID.
                # NOTE(review): a headerless FID+IID file with purely numeric
                # IIDs would be misclassified here — confirm upstream inputs.
                try:
                    float(parts[1])
                    mode = 'iid_only'
                except (ValueError, IndexError):
                    mode = 'fid_iid'
            if mode == 'fid_iid':
                if len(parts) < 3:
                    continue  # not enough columns for FID, IID and >=1 PC
                fid = parts[0]
                iid = parts[1]
                pcs = parts[2:]
                sample_id = iid if id_mode == 'iid' else f'{fid}:{iid}'
            elif mode == 'iid_only':
                if len(parts) < 2:
                    continue  # not enough columns for IID and >=1 PC
                iid = parts[0]
                pcs = parts[1:]
                sample_id = iid
            else:
                raise ValueError(f'Unrecognized eigenvec format in {eigenvec_path}')
            if n_pcs == 0:
                # PC count is fixed by the first data row; later rows are not re-checked.
                n_pcs = len(pcs)
            rows.append((sample_id, pcs))

    if not rows:
        raise ValueError(f'No valid data found in {eigenvec_path}')

    header = ['sample_id'] + [f'PC{i+1}' for i in range(n_pcs)]
    with out_pca.open('w') as fh:
        fh.write('\\t'.join(header) + '\\n')
        for sample_id, pcs in rows:
            fh.write(sample_id + '\\t' + '\\t'.join(pcs) + '\\n')

    print(f'[INFO] Converted {len(rows)} samples with {n_pcs} PCs -> {out_pca}')
    return n_pcs


def read_table_robust(path):
    """Read a tab-separated table as strings and drop duplicate header rows.

    Concatenated PLINK outputs can repeat the header line inside the data;
    any row that looks like a header (mostly matches the column names, or
    contains several known header keywords) is removed.

    Note: '\\t' below is rendered to a literal tab by the Nextflow/Groovy
    template engine.

    Args:
        path: path (string) of the TSV file to read.

    Returns:
        pandas.DataFrame with all values as strings and a reset index.
    """
    df = pd.read_csv(path, sep='\\t', dtype=str)
    print(f'[DEBUG] Initial read: {df.shape[0]} rows x {df.shape[1]} cols', flush=True)
    col_names_upper = set(str(c).upper() for c in df.columns)

    def is_header_row(row):
        # A row is a stray header if >=3 of its values equal column names,
        # or >=2 of them are well-known eigenvec header keywords.
        row_values_upper = [str(v).upper() for v in row.values]
        overlap = sum(1 for v in row_values_upper if v in col_names_upper)
        if overlap >= 3:
            return True
        header_keywords = {'FID', 'IID', 'PC1', 'PC2', 'PC3'}
        if sum(1 for v in row_values_upper if v in header_keywords) >= 2:
            return True
        return False

    bad_rows = df.apply(is_header_row, axis=1)
    if bad_rows.any():
        n_bad = int(bad_rows.sum())
        print(f'[INFO] Removed {n_bad} duplicate header row(s)', flush=True)
        df = df[~bad_rows].copy().reset_index(drop=True)

    print(f'[INFO] After cleanup: {df.shape[0]} rows x {df.shape[1]} cols', flush=True)
    return df


def build_sample_id(df):
    """Extract a sample-identifier Series from *df* and return it with the
    remaining feature columns.

    Lookup priority: an explicit 'sample_id' column, then a case-insensitive
    'IID' column, then the first non-PC column, and finally synthetic ids
    ('sample_0', 'sample_1', ...) when every column looks like a PC score.

    Args:
        df: pandas.DataFrame holding the PCA table (string dtype).

    Returns:
        Tuple of (sample_ids: pandas.Series of str,
                  features: pandas.DataFrame without the id column).
    """
    cols = list(df.columns)
    if 'sample_id' in df.columns:
        return df['sample_id'].astype(str), df.drop(columns=['sample_id'])
    iid_candidates = [c for c in cols if str(c).upper() == 'IID']
    if iid_candidates:
        iid = iid_candidates[0]
        return df[iid].astype(str), df.drop(columns=[iid])
    # Fix: the original had an unreachable `if fid_candidates and iid_candidates`
    # branch here — `iid_candidates` is always empty at this point because a
    # non-empty list already returned above. A lone FID column (no IID) is
    # handled below as the first non-PC column.
    pc_cols = [c for c in cols if PC_COL_RE.match(str(c))]
    non_pc_cols = [c for c in cols if c not in pc_cols]
    if non_pc_cols:
        id_col = non_pc_cols[0]
        return df[id_col].astype(str), df.drop(columns=[id_col])
    # Every column is a PC score: fall back to positional synthetic ids.
    return pd.Series([f'sample_{i}' for i in range(len(df))], index=df.index), df


def main():
    """Template entry point: convert the eigenvec input, cluster the PC
    matrix with KMeans or DBSCAN, and write the cluster CSV plus an info JSON.

    All '${...}' tokens below are substituted by the Nextflow/Groovy template
    engine before this script runs (meta.id, eigenvec, algorithm, n_clusters,
    dbscan_eps, dbscan_min_samples).
    """
    prefix = '${meta.id}'

    # Normalize the PLINK2 .eigenvec into a sample_id/PC1..PCn TSV.
    pca_tsv = Path(f'{prefix}_pca_scores.tsv')
    convert_eigenvec_to_tsv(Path('${eigenvec}'), pca_tsv, 'iid')

    df = read_table_robust(str(pca_tsv))
    sample_ids, df_feats = build_sample_id(df)

    pc_cols = [c for c in df_feats.columns if PC_COL_RE.match(str(c))]
    if not pc_cols:
        raise ValueError('No PC columns found in input')

    # Coerce to floats; any unparsable value becomes NaN and aborts the run.
    X = df_feats[pc_cols].apply(pd.to_numeric, errors='coerce').values
    if np.isnan(X).any():
        raise ValueError('NaN values detected in PCA data')

    print(f'[INFO] Loaded {X.shape[0]} samples x {X.shape[1]} principal components', flush=True)

    if '${algorithm}' == 'kmeans':
        # random_state pinned for reproducible cluster assignments.
        model = KMeans(n_clusters=${n_clusters}, init='random', n_init=100, random_state=42)
        labels = model.fit_predict(X)
        info = {'algorithm': 'kmeans', 'k': ${n_clusters}, 'inertia': float(model.inertia_)}
    else:
        # Any non-'kmeans' value falls through to DBSCAN; label -1 marks noise.
        model = DBSCAN(eps=${dbscan_eps}, min_samples=${dbscan_min_samples})
        labels = model.fit_predict(X)
        n_found = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = int(np.sum(labels == -1))
        info = {
            'algorithm': 'dbscan',
            'eps': ${dbscan_eps},
            'min_samples': ${dbscan_min_samples},
            'n_clusters_found': int(n_found),
            'n_noise': n_noise
        }

    out_clusters = f'{prefix}_clusters.csv'
    out_info = f'{prefix}_clustering_info.json'

    pd.DataFrame({'sample_id': sample_ids.astype(str), 'cluster': labels}).to_csv(out_clusters, index=False)
    info.update({
        'n_samples': int(X.shape[0]),
        'n_features': int(X.shape[1]),
        'feature_names': pc_cols,
        'input_file': Path('${eigenvec}').name
    })
    Path(out_info).write_text(json.dumps(info, indent=2))

    print('[SUCCESS] Clustering completed:')
    print(f' -> Clusters : {out_clusters}')
    print(f' -> Info : {out_info}')


# Record tool versions for nf-core provenance tracking.
# NOTE(review): the key is hardcoded as 'CUSTOM_CLUSTERING' rather than the
# interpolated '${task.process}' used by the stub, and numpy is listed here
# but not in the stub's versions.yml — confirm the snapshots accept both forms.
versions = {
    'CUSTOM_CLUSTERING': {
        'python': platform.python_version(),
        'scikit-learn': sklearn.__version__,
        'pandas': pd.__version__,
        'numpy': np.__version__,
    }
}
with open('versions.yml', 'w') as fh:
    fh.write(yaml.dump(versions, default_flow_style=False))

main()
6 changes: 6 additions & 0 deletions modules/nf-core/custom/clustering/tests/data/test.eigenvec
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#FID IID PC1 PC2 PC3
0 sample01 0.1234 0.5678 0.9012
0 sample02 -0.2345 0.6789 -0.0123
0 sample03 0.3456 -0.7890 0.1234
0 sample04 -0.4567 0.8901 -0.2345
0 sample05 0.5678 -0.9012 0.3456
51 changes: 51 additions & 0 deletions modules/nf-core/custom/clustering/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// nf-test suite for CUSTOM_CLUSTERING: runs KMeans (k=3) on a tiny bundled
// .eigenvec file, once for real and once in stub mode, snapshotting outputs.
nextflow_process {
    name "Test Process CUSTOM_CLUSTERING"
    script "../main.nf"
    process "CUSTOM_CLUSTERING"

    tag "modules"
    tag "modules_nfcore"
    tag "custom"
    tag "custom/clustering"

    test("clustering - eigenvec") {
        when {
            process {
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            assert process.success
            // Snapshot the clusters CSV, the info JSON and the versions file.
            assert snapshot(
                process.out.clusters,
                process.out.info,
                process.out.versions
            ).match()
        }
    }

    test("clustering - eigenvec - stub") {
        options "-stub"
        when {
            process {
                // Same inputs as above; stub mode only touches empty outputs.
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            assert process.success
            assert snapshot(process.out).match()
        }
    }
}
Loading
Loading