From 9abde152a6c70d636fd5db0c69e6775e0957cb4f Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 29 Apr 2026 15:23:30 +0200 Subject: [PATCH 01/38] New modules: cluster_metrics + cluster_viz - cluster_metrics: computes clustering quality metrics + k-sweep - cluster_viz: generates PCA, UMAP and t-SNE plots colored by cluster - Both use conda environment.yml - Full nf-test coverage --- .../nf-core/cluster_metrics/environment.yml | 10 + modules/nf-core/cluster_metrics/main.nf | 50 ++++ modules/nf-core/cluster_metrics/meta.yml | 62 ++++ .../templates/cluster_metrics.py | 276 ++++++++++++++++++ .../tests/data/test_clusters.csv | 6 + .../tests/data/test_features.tsv | 6 + .../cluster_metrics/tests/main.nf.test | 44 +++ .../cluster_metrics/tests/main.nf.test.snap | 129 ++++++++ modules/nf-core/cluster_viz/environment.yml | 11 + modules/nf-core/cluster_viz/main.nf | 55 ++++ modules/nf-core/cluster_viz/meta.yml | 105 +++++++ .../cluster_viz/templates/cluster_viz.py | 213 ++++++++++++++ .../cluster_viz/tests/data/test_clusters.csv | 6 + .../cluster_viz/tests/data/test_features.tsv | 6 + .../cluster_viz/tests/data/test_pca.eigenvec | 6 + .../nf-core/cluster_viz/tests/main.nf.test | 52 ++++ .../cluster_viz/tests/main.nf.test.snap | 151 ++++++++++ 17 files changed, 1188 insertions(+) create mode 100644 modules/nf-core/cluster_metrics/environment.yml create mode 100644 modules/nf-core/cluster_metrics/main.nf create mode 100644 modules/nf-core/cluster_metrics/meta.yml create mode 100644 modules/nf-core/cluster_metrics/templates/cluster_metrics.py create mode 100644 modules/nf-core/cluster_metrics/tests/data/test_clusters.csv create mode 100644 modules/nf-core/cluster_metrics/tests/data/test_features.tsv create mode 100644 modules/nf-core/cluster_metrics/tests/main.nf.test create mode 100644 modules/nf-core/cluster_metrics/tests/main.nf.test.snap create mode 100644 modules/nf-core/cluster_viz/environment.yml create mode 100644 modules/nf-core/cluster_viz/main.nf create mode 100644 
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    PROCESS: CLUSTER_METRICS
    Compute clustering quality metrics and k-sweep
    Author: Donald Baku (athor)

    Inputs
      tuple(meta, features, clusters) : sample map, feature table, cluster assignments
      out_prefix                      : fallback prefix for output file names

    Output file names are built from `task.ext.prefix`, else `out_prefix`,
    else `meta.id`. Extra command-line options for the Python script
    (e.g. `--k-min 2 --k-max 20`) can be supplied through `task.ext.args`.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

process CLUSTER_METRICS {
    tag "$meta.id"
    label 'process_medium'
    conda "${moduleDir}/environment.yml"

    input:
    tuple val(meta), path(features), path(clusters)
    val out_prefix

    output:
    tuple val(meta), path("*_metrics.tsv")   , emit: metrics
    tuple val(meta), path("*_k_sweep.csv")   , emit: k_sweep
    tuple val(meta), path("*_selected.json") , emit: selected
    tuple val(meta), path("*.png")           , emit: plots, optional: true
    path "versions.yml"                      , emit: versions, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Optional extra CLI flags for cluster_metrics.py; empty by default so the
    // command line is unchanged unless the pipeline config sets ext.args.
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}"

    // The script is resolved relative to this module (moduleDir), the same
    // anchor the conda directive uses, rather than a hard-coded path under
    // projectDir.
    """
    python3 ${moduleDir}/templates/cluster_metrics.py \\
        --features ${features} \\
        --clusters ${clusters} \\
        --out-k-sweep ${prefix}_k_sweep.csv \\
        --out-selected ${prefix}_selected.json \\
        --out-prefix ${prefix} \\
        ${args}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | cut -d' ' -f2)
        pandas: \$(python3 -c "import pandas; print(pandas.__version__)")
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
        matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)")
    END_VERSIONS
    """
}
#!/usr/bin/env python3
"""Clustering quality metrics and KMeans k-sweep.

Given a feature table and a cluster-assignment table this script:

  * aligns the two inputs (by sample ID where possible, otherwise by row order),
  * scores the supplied clustering with the silhouette, Calinski-Harabasz and
    Davies-Bouldin indices and writes them to ``<out-prefix>_metrics.tsv`` and
    to the ``--out-selected`` JSON file,
  * re-clusters the same features with KMeans for every k in [k-min, k-max]
    and writes the per-k scores to ``--out-k-sweep``,
  * draws one PNG per sweep metric (best effort; failures are recorded in
    ``plot_warning.txt`` instead of aborting the run).
"""

import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd

# scikit-learn and matplotlib are imported inside the functions that use them,
# so the loaders and the degenerate-input paths do not depend on them.

# Column order of the k-sweep table. Passing it explicitly keeps the CSV header
# present even when the sweep range is empty (k-min greater than the usable k-max).
_SWEEP_COLUMNS = ["k", "inertia", "silhouette", "calinski_harabasz", "davies_bouldin"]


def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose sample identifier column is named ``sample_id``.

    Column names are matched case-insensitively after stripping a leading ``#``.
    A ``sample_id`` column wins; otherwise ``IID`` is used and any ``FID``
    column is dropped. Rows whose IID value is literally ``FID``/``IID``
    (a repeated header line) are removed first.

    Raises:
        ValueError: if neither a ``sample_id`` nor an ``IID`` column exists, or
            if no column is left after dropping an all-numeric IID column.
    """
    df = df.copy()
    df.columns = [str(c).lstrip("#") for c in df.columns]
    by_upper = {str(c).upper(): c for c in df.columns}

    if "IID" in by_upper:
        repeated_header = (
            df[by_upper["IID"]].astype(str).str.upper().isin({"FID", "IID"})
        )
        if repeated_header.any():
            df = df.loc[~repeated_header].copy().reset_index(drop=True)

    if "SAMPLE_ID" in by_upper:
        sample_col = by_upper["SAMPLE_ID"]
        if sample_col != "sample_id":
            df = df.rename(columns={sample_col: "sample_id"})
        return df

    if "IID" in by_upper:
        iid_col = by_upper["IID"]
        iid_is_numeric = pd.to_numeric(df[iid_col], errors="coerce").notna().all()

        if iid_is_numeric:
            # NOTE(review): an all-numeric IID is discarded and the first
            # remaining column becomes the ID (FID when the layout is
            # "FID IID ..."). Presumably intentional for the upstream PCA
            # output; confirm before relying on it for other layouts.
            df = df.drop(columns=[iid_col])
            if len(df.columns) == 0:
                raise ValueError("Cannot infer sample_id after dropping numeric IID column")
            df = df.rename(columns={df.columns[0]: "sample_id"})
        else:
            df = df.rename(columns={iid_col: "sample_id"})

        fid_cols = [c for c in df.columns if str(c).upper() == "FID"]
        if fid_cols:
            df = df.drop(columns=fid_cols)
        return df

    raise ValueError(
        f"Cannot find sample ID column (expected 'sample_id' or 'IID'). "
        f"Found: {list(df.columns)}"
    )


def _detect_separator(path: str) -> str:
    """Return a tab if the header line of *path* has more tabs than commas, else a comma."""
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        header = handle.readline()
    return "\t" if header.count("\t") > header.count(",") else ","


def _to_int_labels(values: pd.Series) -> pd.Series:
    """Convert cluster labels to ``int``; raises ``ValueError`` on non-numeric labels."""
    return pd.to_numeric(values, errors="raise").astype(int)


def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]:
    """Read the tab-separated feature table.

    Returns:
        ``(X, sample_ids)`` where ``X`` holds every non-ID column converted to
        numbers and ``sample_ids`` is the ID column as strings. Values that do
        not parse as numbers are replaced by the column mean, or by 0.0 when
        the whole column is unparsable.

    Raises:
        ValueError: if no sample ID column can be identified.
    """
    df = _normalise_id_column(pd.read_csv(path, sep="\t", dtype=str))

    if "sample_id" not in df.columns:
        raise ValueError("features file must contain a sample_id column after normalization")

    sample_ids = df["sample_id"].astype(str)
    X = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce")
    X = X.fillna(X.mean(numeric_only=True))
    X = X.fillna(0.0)
    return X, sample_ids


def _looks_mostly_numeric(s: pd.Series) -> bool:
    """True when at least 80% of the values in *s* parse as numbers (False if empty)."""
    if len(s) == 0:
        return False
    parsed = pd.to_numeric(s.astype(str), errors="coerce")
    return float(parsed.notna().mean()) >= 0.8


def load_clusters(path: str) -> tuple[pd.DataFrame, str]:
    """Read the cluster-assignment table (comma- or tab-separated).

    The ``cluster`` column is located case-insensitively. The sample ID is
    taken, in order of preference, from a ``sample_id`` column, from an
    ``IID`` column, or from the single remaining column when it is not mostly
    numeric.

    Returns:
        ``(table, mode)``. With ``mode == "sample_id"`` the table has columns
        ``sample_id`` and ``cluster``; with ``mode == "row_order"`` it has only
        ``cluster`` and must be matched to the features by position.

    Raises:
        ValueError: if there is no ``cluster`` column or a label is not numeric.
    """
    df = pd.read_csv(path, sep=_detect_separator(path), dtype=str)
    df.columns = [str(c).lstrip("#") for c in df.columns]
    by_upper = {str(c).upper(): c for c in df.columns}

    if "CLUSTER" not in by_upper:
        raise ValueError("clusters CSV must have a 'cluster' column")
    cluster_col = by_upper["CLUSTER"]

    if "SAMPLE_ID" in by_upper:
        out = df[[by_upper["SAMPLE_ID"], cluster_col]].copy()
        out.columns = ["sample_id", "cluster"]
        out["sample_id"] = out["sample_id"].astype(str)
        out["cluster"] = _to_int_labels(out["cluster"])
        return out, "sample_id"

    # Try the PLINK-style IID/FID layout. Failing to find an ID column here is
    # expected for plain label files, so it only means "fall through".
    try:
        norm = _normalise_id_column(df).rename(columns={cluster_col: "cluster"})
    except (ValueError, KeyError, TypeError):
        norm = None
    if norm is not None and {"sample_id", "cluster"} <= set(norm.columns):
        out = norm[["sample_id", "cluster"]].copy()
        out["sample_id"] = out["sample_id"].astype(str)
        out["cluster"] = _to_int_labels(out["cluster"])
        return out, "sample_id"

    other_cols = [c for c in df.columns if c != cluster_col]
    if len(other_cols) == 1:
        candidate_vals = df[other_cols[0]].astype(str)
        # A lone non-numeric companion column is taken to be the sample ID.
        if not _looks_mostly_numeric(candidate_vals):
            out = pd.DataFrame({
                "sample_id": candidate_vals,
                "cluster": _to_int_labels(df[cluster_col]),
            })
            return out, "sample_id"

    return pd.DataFrame({"cluster": _to_int_labels(df[cluster_col])}), "row_order"


def _undefined_metrics(n_clusters: int) -> dict:
    """Result used when the indices are not defined for the given labelling."""
    return {
        "n_clusters": int(n_clusters),
        "silhouette": None,
        "calinski_harabasz": None,
        "davies_bouldin": None,
    }


def safe_cluster_metrics(X: np.ndarray, labels: np.ndarray) -> dict:
    """Score a clustering without raising on degenerate labelings.

    Points labelled ``-1`` are treated as noise and excluded. The three scores
    are ``None`` when fewer than two clusters remain, or when every remaining
    point is its own cluster (scikit-learn only accepts between 2 and
    ``n_samples - 1`` labels).

    Returns:
        dict with ``n_clusters``, ``silhouette``, ``calinski_harabasz`` and
        ``davies_bouldin``.
    """
    uniq = np.unique(labels)
    n_clusters = len(uniq) - (1 if -1 in uniq else 0)
    if n_clusters < 2:
        return _undefined_metrics(n_clusters)

    keep = labels != -1
    X_use, y_use = X[keep], labels[keep]
    n_labels = len(np.unique(y_use))
    # Same validity window the k-sweep applies: 1 < n_labels < n_samples.
    if len(X_use) < 2 or n_labels < 2 or n_labels >= len(X_use):
        return _undefined_metrics(n_clusters)

    from sklearn.metrics import (
        calinski_harabasz_score,
        davies_bouldin_score,
        silhouette_score,
    )

    return {
        "n_clusters": int(n_clusters),
        "silhouette": float(silhouette_score(X_use, y_use)),
        "calinski_harabasz": float(calinski_harabasz_score(X_use, y_use)),
        "davies_bouldin": float(davies_bouldin_score(X_use, y_use)),
    }


def _align_inputs(X_df, sample_ids, clusters_df, cluster_mode):
    """Match feature rows to cluster labels.

    Returns ``(X, labels, aligned_ids, alignment_mode)`` where
    ``alignment_mode`` is ``"sample_id"``, ``"row_order_fallback"`` (IDs were
    present but none overlapped and the row counts matched) or ``"row_order"``.
    """
    if cluster_mode == "sample_id":
        clusters = clusters_df.set_index("sample_id")["cluster"]
        common = sample_ids[sample_ids.isin(clusters.index)]

        if len(common) > 0:
            matched = clusters[clusters.index.isin(common)]
            if matched.index.has_duplicates:
                # A duplicated ID would make .loc return extra labels and
                # silently shift labels relative to feature rows.
                dupes = matched.index[matched.index.duplicated()].unique().tolist()
                raise ValueError(
                    f"Duplicate sample_id entries in clusters file (first 5): {dupes[:5]}"
                )
            return (
                X_df.loc[common.index].values,
                clusters.loc[common.values].values,
                common.astype(str).tolist(),
                "sample_id",
            )

        if len(clusters_df) == len(sample_ids):
            return (
                X_df.values,
                clusters_df["cluster"].values,
                sample_ids.astype(str).tolist(),
                "row_order_fallback",
            )

        raise ValueError(
            f"No overlapping sample_id between features and clusters.\n"
            f"  features IDs (first 5): {sample_ids.head().tolist()}\n"
            f"  clusters IDs (first 5): {list(clusters.index[:5])}"
        )

    if len(clusters_df) != len(sample_ids):
        raise ValueError(
            "clusters CSV has no usable sample_id column and row counts do not match.\n"
            f"  n_features={len(sample_ids)}\n"
            f"  n_clusters={len(clusters_df)}"
        )
    return (
        X_df.values,
        clusters_df["cluster"].values,
        sample_ids.astype(str).tolist(),
        "row_order",
    )


def _k_sweep(X: np.ndarray, k_min: int, k_max: int) -> pd.DataFrame:
    """Fit KMeans for each k in [k_min, min(k_max, n_samples)] and score it."""
    from sklearn.cluster import KMeans
    from sklearn.metrics import (
        calinski_harabasz_score,
        davies_bouldin_score,
        silhouette_score,
    )

    rows = []
    max_k = min(int(k_max), len(X))
    for k in range(int(k_min), max_k + 1):
        model = KMeans(n_clusters=k, n_init="auto", random_state=42)
        y = model.fit_predict(X)

        sil = ch = db = None
        # The indices are only defined for 2 .. n_samples-1 distinct labels.
        if 1 < len(np.unique(y)) < len(X):
            sil = float(silhouette_score(X, y))
            ch = float(calinski_harabasz_score(X, y))
            db = float(davies_bouldin_score(X, y))

        rows.append({
            "k": k,
            "inertia": float(model.inertia_),
            "silhouette": sil,
            "calinski_harabasz": ch,
            "davies_bouldin": db,
        })
    return pd.DataFrame(rows, columns=_SWEEP_COLUMNS)


def _plot_sweep(sweep_df: pd.DataFrame, pfx: str) -> None:
    """Write one PNG per sweep metric; on any failure write plot_warning.txt instead."""
    try:
        import matplotlib

        matplotlib.use("Agg")  # headless backend
        import matplotlib.pyplot as plt

        def plot_curve(metric, title, ylabel, out_png):
            plt.figure(figsize=(7, 4.5))
            vals = sweep_df[metric].dropna()
            ks = sweep_df.loc[vals.index, "k"]
            plt.plot(ks, vals, marker="o")
            plt.xticks(sweep_df["k"].tolist())
            plt.title(title)
            plt.xlabel("k")
            plt.ylabel(ylabel)
            plt.tight_layout()
            plt.savefig(out_png, dpi=200)
            plt.close()

        if not sweep_df.empty:
            plot_curve("inertia", "Elbow method (KMeans inertia)", "inertia", f"{pfx}_elbow.png")
            plot_curve("silhouette", "Silhouette score (higher is better)", "silhouette", f"{pfx}_silhouette.png")
            plot_curve("davies_bouldin", "Davies-Bouldin index (lower is better)", "davies_bouldin", f"{pfx}_davies_bouldin.png")
            plot_curve("calinski_harabasz", "Calinski-Harabasz index (higher is better)", "calinski_harabasz", f"{pfx}_calinski.png")

    except Exception as e:
        # Plots are an optional output; keep the tabular results.
        Path("plot_warning.txt").write_text(f"Plotting failed: {e}\n")


def main() -> None:
    """Command-line entry point; see the module docstring for the outputs."""
    ap = argparse.ArgumentParser(
        description="Clustering quality metrics and KMeans k-sweep"
    )
    ap.add_argument("--features", required=True)
    ap.add_argument("--clusters", required=True)
    ap.add_argument("--k-min", type=int, default=2)
    ap.add_argument("--k-max", type=int, default=12)
    ap.add_argument("--out-k-sweep", required=True)
    ap.add_argument("--out-selected", required=True)
    ap.add_argument("--out-prefix", required=True)
    args = ap.parse_args()

    X_df, sample_ids = load_features(args.features)
    clusters_df, cluster_mode = load_clusters(args.clusters)
    X, labels, aligned_ids, alignment_mode = _align_inputs(
        X_df, sample_ids, clusters_df, cluster_mode
    )

    if len(X) < 2:
        raise ValueError("Need at least 2 samples to compute cluster metrics")

    selected = safe_cluster_metrics(X, labels)
    selected["input_clusters"] = Path(args.clusters).name
    selected["input_features"] = Path(args.features).name
    selected["n_samples_used"] = int(len(aligned_ids))
    selected["alignment_mode"] = alignment_mode

    metrics_tsv = f"{args.out_prefix}_metrics.tsv"
    pd.DataFrame([selected]).to_csv(metrics_tsv, sep="\t", index=False)

    sweep_df = _k_sweep(X, args.k_min, args.k_max)
    sweep_df.to_csv(args.out_k_sweep, sep=",", index=False)
    Path(args.out_selected).write_text(json.dumps(selected, indent=2))

    _plot_sweep(sweep_df, args.out_prefix)


if __name__ == "__main__":
    main()
a/modules/nf-core/cluster_metrics/tests/data/test_features.tsv b/modules/nf-core/cluster_metrics/tests/data/test_features.tsv new file mode 100644 index 000000000000..033d23b82df8 --- /dev/null +++ b/modules/nf-core/cluster_metrics/tests/data/test_features.tsv @@ -0,0 +1,6 @@ +sample_id PC1 PC2 PC3 +sample01 0.1234 0.5678 0.9012 +sample02 -0.2345 0.6789 -0.0123 +sample03 0.3456 -0.7890 0.1234 +sample04 -0.4567 0.8901 -0.2345 +sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/cluster_metrics/tests/main.nf.test b/modules/nf-core/cluster_metrics/tests/main.nf.test new file mode 100644 index 000000000000..c66d8349ed93 --- /dev/null +++ b/modules/nf-core/cluster_metrics/tests/main.nf.test @@ -0,0 +1,44 @@ +nextflow_process { + name "Test Process CLUSTER_METRICS" + script "../main.nf" + process "CLUSTER_METRICS" + tag "modules" + tag "modules_nfcore" + tag "cluster_metrics" + + test("cluster_metrics - features and clusters") { + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[1] = 'test' + """ + } + } + then { + assert process.success + assert snapshot( + process.out.metrics, + process.out.k_sweep, + process.out.selected, + process.out.versions + ).match() + } + } + + test("cluster_metrics - features and clusters - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[1] = 'test' + """ + } + } + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/nf-core/cluster_metrics/tests/main.nf.test.snap 
b/modules/nf-core/cluster_metrics/tests/main.nf.test.snap new file mode 100644 index 000000000000..30f9c8ea46a9 --- /dev/null +++ b/modules/nf-core/cluster_metrics/tests/main.nf.test.snap @@ -0,0 +1,129 @@ +{ + "cluster_metrics - features and clusters - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + ] + ], + "3": [ + [ + { + "id": "test" + }, + [ + "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", + "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", + "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", + "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + ] + ] + ], + "4": [ + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + ], + "k_sweep": [ + [ + { + "id": "test" + }, + "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" + ] + ], + "metrics": [ + [ + { + "id": "test" + }, + "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + ] + ], + "plots": [ + [ + { + "id": "test" + }, + [ + "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", + "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", + "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", + "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + ] + ] + ], + "selected": [ + [ + { + "id": "test" + }, + "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + ] + ], + "versions": [ + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + ] + } + ], + "timestamp": "2026-04-29T14:41:23.201098606", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + }, + "cluster_metrics - features and clusters": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + ] + ], + [ + [ + { + "id": "test" + }, + 
"test_k_sweep.csv:md5,b321710d5bc65ecdd9894da7e0de7d67" + ] + ], + [ + [ + { + "id": "test" + }, + "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + ] + ], + [ + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + ] + ], + "timestamp": "2026-04-29T14:41:18.57417712", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/cluster_viz/environment.yml b/modules/nf-core/cluster_viz/environment.yml new file mode 100644 index 000000000000..803fb67fb108 --- /dev/null +++ b/modules/nf-core/cluster_viz/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - matplotlib=3.9.* + - pandas=2.2.* + - python=3.12 + - scikit-learn=1.5.* + - umap-learn=0.5.* diff --git a/modules/nf-core/cluster_viz/main.nf b/modules/nf-core/cluster_viz/main.nf new file mode 100644 index 000000000000..a68e4cc6a1ad --- /dev/null +++ b/modules/nf-core/cluster_viz/main.nf @@ -0,0 +1,55 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + PROCESS: CLUSTER_VIZ + Generates PCA, UMAP and t-SNE visualizations colored by cluster + Author: Donald Baku (athor) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process CLUSTER_VIZ { + tag "$meta.id" + label 'process_medium' + conda "${moduleDir}/environment.yml" + + input: + tuple val(meta), path(features), path(clusters), path(pca_scores) + val out_prefix + + output: + tuple val(meta), path("*_umap.tsv") , emit: umap + tuple val(meta), path("*_tsne.tsv") , emit: tsne + tuple val(meta), path("*_umap.png") , emit: umap_png + tuple val(meta), path("*_tsne.png") , emit: tsne_png + tuple val(meta), path("*_pca.png") , emit: pca_png + path "versions.yml" , emit: versions, topic: 
versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" + + """ + python3 ${projectDir}/modules/nf-core/cluster_viz/templates/cluster_viz.py \\ + --features ${features} \\ + --clusters ${clusters} \\ + --pca-scores ${pca_scores} \\ + --out-umap-tsv ${prefix}_umap.tsv \\ + --out-tsne-tsv ${prefix}_tsne.tsv \\ + --out-umap-png ${prefix}_umap.png \\ + --out-tsne-png ${prefix}_tsne.png \\ + --out-pca-png ${prefix}_pca.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | cut -d' ' -f2) + pandas: \$(python3 -c "import pandas; print(pandas.__version__)") + scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") + umap-learn: \$(python3 -c "import umap; print(umap.__version__)" 2>/dev/null || echo 'N/A') + matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cluster_viz/meta.yml b/modules/nf-core/cluster_viz/meta.yml new file mode 100644 index 000000000000..eca231e91f85 --- /dev/null +++ b/modules/nf-core/cluster_viz/meta.yml @@ -0,0 +1,105 @@ +name: "cluster_viz" +description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster assignment" +keywords: + - clustering + - visualization + - umap + - tsne + - pca + +tools: + - "scikit-learn": + description: "Machine learning library" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/" + licence: ["BSD-3-Clause"] + - "umap-learn": + description: "Uniform Manifold Approximation and Projection" + homepage: "https://umap-learn.readthedocs.io/" + documentation: "https://umap-learn.readthedocs.io/" + licence: ["BSD-3-Clause"] + +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
#!/usr/bin/env python3

"""Cluster visualizations.

Produces three 2D plots, all colored by cluster label:
  - PCA   (first two columns from pca_scores)
  - UMAP  (computed on the feature matrix used for clustering)
  - t-SNE (computed on the feature matrix used for clustering)

Also writes UMAP and t-SNE coordinates to TSV.
"""

import argparse
import sys

import numpy as np
import pandas as pd

# scikit-learn, umap-learn and matplotlib are imported inside the functions
# that use them, so the loaders do not depend on them.


def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with its sample identifier column named ``sample_id``.

    Handles the header formats that FlashPCA/PLINK2 produces:
      - '#IID'        (PLINK2 eigenvec: leading hash on first column)
      - 'IID'         (FlashPCA / older PLINK)
      - 'FID', 'IID'  (two-column prefix)
      - 'sample_id'   (already normalised; matched case-insensitively)

    Raises:
        ValueError: if neither a ``sample_id`` nor an ``IID`` column exists, or
            if no column is left after dropping an all-numeric IID column.
    """
    # Strip leading '#' (PLINK2 eigenvec writes '#IID' as the first column)
    df = df.rename(columns=lambda c: str(c).lstrip("#"))
    by_upper = {str(c).upper(): c for c in df.columns}

    # Remove duplicate header row (IID value == "FID" or "IID")
    if "IID" in by_upper:
        repeated_header = (
            df[by_upper["IID"]].astype(str).str.upper().isin({"FID", "IID"})
        )
        if repeated_header.any():
            df = df[~repeated_header].copy().reset_index(drop=True)

    if "SAMPLE_ID" in by_upper:
        # Callers index the frame with the literal name "sample_id", so a
        # case-variant header such as "Sample_ID" must be renamed here.
        return df.rename(columns={by_upper["SAMPLE_ID"]: "sample_id"})

    if "IID" in by_upper:
        iid_col = by_upper["IID"]
        iid_is_numeric = pd.to_numeric(df[iid_col], errors="coerce").notna().all()
        if iid_is_numeric:
            # NOTE(review): an all-numeric IID is discarded and the first
            # remaining column becomes the ID (FID when the layout is
            # "FID IID ..."). Presumably intentional; confirm for other layouts.
            df = df.drop(columns=[iid_col])
            if len(df.columns) == 0:
                raise ValueError("Cannot infer sample_id after dropping numeric IID column")
            df = df.rename(columns={df.columns[0]: "sample_id"})
        else:
            df = df.rename(columns={iid_col: "sample_id"})

        fid_cols = [c for c in df.columns if str(c).upper() == "FID"]
        if fid_cols:
            df = df.drop(columns=fid_cols)
        return df

    raise ValueError(
        f"Cannot find sample ID column (expected 'sample_id' or 'IID'). "
        f"Found: {list(df.columns)}"
    )


def _detect_separator(path: str) -> str:
    """Return a tab if the header line of *path* has more tabs than commas, else a comma."""
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        header = handle.readline()
    return "\t" if header.count("\t") > header.count(",") else ","


def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]:
    """Read a whitespace-separated feature table.

    Returns:
        ``(X, sample_ids)``: every non-ID column converted to numbers
        (unparsable values become 0.0) and the ID column as strings.
    """
    df = pd.read_csv(path, sep=r"\s+", engine="python", dtype=str)
    df = _normalise_id_column(df)
    sample_ids = df["sample_id"].astype(str)
    X = (
        df.drop(columns=["sample_id"])
        .apply(pd.to_numeric, errors="coerce")
        .fillna(0.0)
    )
    return X, sample_ids


def load_clusters(path: str) -> pd.Series:
    """Read cluster assignments (comma- or tab-separated).

    Returns:
        Integer Series named ``cluster`` indexed by ``sample_id``.

    Raises:
        ValueError: if the ID or ``cluster`` column is missing (the latter is
            matched case-insensitively) or a label is not numeric.
    """
    df = pd.read_csv(path, sep=_detect_separator(path), dtype=str)
    df = _normalise_id_column(df)
    by_upper = {str(c).upper(): c for c in df.columns}
    if "CLUSTER" not in by_upper:
        raise ValueError("clusters CSV must have a 'cluster' column")
    labels = df.set_index("sample_id")[by_upper["CLUSTER"]].rename("cluster")
    return pd.to_numeric(labels, errors="raise").astype(int)


def safe_perplexity(n_samples: int, requested: float) -> float:
    """Clamp the requested t-SNE perplexity to a value usable for *n_samples*.

    Returns 1.0 for three samples or fewer; otherwise the requested value
    capped at ``(n_samples - 1) / 3`` and floored at 2.0.
    """
    if n_samples <= 3:
        return 1.0
    ceiling = (n_samples - 1) / 3.0
    capped = min(requested, ceiling)
    return float(max(2.0, capped))


def compute_umap(X: np.ndarray, n_neighbors: int, min_dist: float) -> np.ndarray:
    """Return 2-D UMAP coordinates for *X*.

    Best effort: if umap-learn is missing or the fit fails, a warning is
    printed and the first two feature columns (zero-padded when there are
    fewer than two) are returned instead.
    """
    try:
        import umap

        reducer = umap.UMAP(
            n_components=2,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            random_state=42,
        )
        return reducer.fit_transform(X)
    except Exception as e:
        print(
            f"[WARN] UMAP failed, fallback to first 2 feature columns: {e}",
            file=sys.stderr,
        )
        n_rows, n_cols = X.shape
        if n_cols >= 2:
            return X[:, :2]
        if n_cols == 1:
            return np.column_stack([X[:, 0], np.zeros(n_rows)])
        return np.zeros((n_rows, 2))


def compute_tsne(X: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray:
    """Return 2-D t-SNE coordinates for *X* (fixed random_state=42)."""
    from sklearn.manifold import TSNE

    # PCA initialisation needs at least two samples and two features;
    # fall back to random initialisation for thinner inputs.
    init = "pca" if min(X.shape) >= 2 else "random"
    return TSNE(
        n_components=2,
        perplexity=perplexity,
        init=init,
        random_state=42,
        max_iter=max_iter,
        learning_rate="auto",
    ).fit_transform(X)


def plot_scatter(df, x, y, out_png, title, xlabel=None, ylabel=None):
    """Scatter ``df[x]`` against ``df[y]`` colored by ``df["cluster"]`` and save to *out_png*.

    Axis labels default to the column names. The legend lists one entry per
    distinct cluster and is placed to the right of the axes.
    """
    import matplotlib

    matplotlib.use("Agg")  # headless backend, as in cluster_metrics
    import matplotlib.pyplot as plt
    from matplotlib.lines import Line2D

    plt.figure(figsize=(7, 5))
    labels = df["cluster"].astype(int).values
    uniq = np.unique(labels)
    sc = plt.scatter(df[x], df[y], c=labels, cmap="Paired", s=24, linewidths=0.4, alpha=0.85)
    plt.title(title)
    plt.xlabel(xlabel or x)
    plt.ylabel(ylabel or y)
    plt.grid(True, alpha=0.5)
    # Build legend handles by hand so each cluster gets the exact colour the
    # scatter's colormap/normalisation assigned to it.
    handles = [
        Line2D([0], [0], marker="o", linestyle="", markersize=7,
               markerfacecolor=sc.cmap(sc.norm(k)), markeredgecolor="none",
               label=f"Cluster {k}")
        for k in uniq
    ]
    plt.legend(handles=handles, title="Clusters", loc="center left",
               bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0, frameon=True)
    plt.tight_layout()
    plt.savefig(out_png, dpi=200, bbox_inches="tight")
    plt.close()


def _embedding_frame(ids, coords: np.ndarray, labels) -> pd.DataFrame:
    """Assemble the sample_id / x / y / cluster table written for an embedding."""
    return pd.DataFrame({
        "sample_id": ids,
        "x": coords[:, 0],
        "y": coords[:, 1],
        "cluster": labels,
    })


def main() -> None:
    """Command-line entry point; see the module docstring for the outputs."""
    ap = argparse.ArgumentParser(description="PCA + UMAP + t-SNE plots colored by cluster")
    ap.add_argument("--features", required=True)
    ap.add_argument("--clusters", required=True)
    ap.add_argument("--pca-scores", required=True)
    ap.add_argument("--tsne-perplexity", type=float, default=30.0)
    ap.add_argument("--tsne-iter", type=int, default=1000)
    ap.add_argument("--umap-neighbors", type=int, default=15)
    ap.add_argument("--umap-min-dist", type=float, default=0.1)
    ap.add_argument("--out-umap-tsv", required=True)
    ap.add_argument("--out-tsne-tsv", required=True)
    ap.add_argument("--out-umap-png", required=True)
    ap.add_argument("--out-tsne-png", required=True)
    ap.add_argument("--out-pca-png", required=True)
    args = ap.parse_args()

    X_df, sample_ids = load_features(args.features)
    clusters = load_clusters(args.clusters)

    common = sample_ids[sample_ids.isin(clusters.index)]
    if len(common) == 0:
        raise ValueError(
            f"No overlapping sample_id between features and clusters.\n"
            f"  features IDs (first 5): {sample_ids.head().tolist()}\n"
            f"  clusters IDs (first 5): {list(clusters.index[:5])}"
        )

    matched = clusters[clusters.index.isin(common)]
    if matched.index.has_duplicates:
        # A duplicated ID would make .loc return extra labels and silently
        # shift labels relative to feature rows.
        dupes = matched.index[matched.index.duplicated()].unique().tolist()
        raise ValueError(
            f"Duplicate sample_id entries in clusters file (first 5): {dupes[:5]}"
        )

    X = X_df.loc[common.index].values
    y = clusters.loc[common.values].values

    umap_coords = compute_umap(X, args.umap_neighbors, args.umap_min_dist)
    umap_df = _embedding_frame(common.values, umap_coords, y)
    umap_df.to_csv(args.out_umap_tsv, sep="\t", index=False)
    plot_scatter(umap_df, "x", "y", args.out_umap_png, "UMAP embedding")

    perp = safe_perplexity(len(common), args.tsne_perplexity)
    tsne_coords = compute_tsne(X, perp, args.tsne_iter)
    tsne_df = _embedding_frame(common.values, tsne_coords, y)
    tsne_df.to_csv(args.out_tsne_tsv, sep="\t", index=False)
    plot_scatter(tsne_df, "x", "y", args.out_tsne_png,
                 f"t-SNE (perplexity={perp:.1f})")

    pca_df = pd.read_csv(args.pca_scores, sep=r"\s+", engine="python", dtype=str)
    pca_df = _normalise_id_column(pca_df)
    comp_cols = [c for c in pca_df.columns if c != "sample_id"]
    if len(comp_cols) < 2:
        raise ValueError("pca_scores must have at least 2 PC columns")
    c1, c2 = comp_cols[0], comp_cols[1]
    for col in (c1, c2):
        pca_df[col] = pd.to_numeric(pca_df[col], errors="coerce")
    merged = pca_df.merge(umap_df[["sample_id", "cluster"]], on="sample_id", how="inner")
    if merged.empty:
        # Without this check the required PCA plot would be written empty.
        raise ValueError(
            "No overlapping sample_id between pca_scores and clustered samples.\n"
            f"  pca_scores IDs (first 5): {pca_df['sample_id'].head().tolist()}\n"
            f"  clustered IDs (first 5): {umap_df['sample_id'].head().tolist()}"
        )
    plot_scatter(merged, c1, c2, args.out_pca_png,
                 "PCA", c1, c2)


if __name__ == "__main__":
    main()
b/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec @@ -0,0 +1,6 @@ +#FID IID PC1 PC2 PC3 +0 sample01 0.1234 0.5678 0.9012 +0 sample02 -0.2345 0.6789 -0.0123 +0 sample03 0.3456 -0.7890 0.1234 +0 sample04 -0.4567 0.8901 -0.2345 +0 sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/cluster_viz/tests/main.nf.test b/modules/nf-core/cluster_viz/tests/main.nf.test new file mode 100644 index 000000000000..341f15cf062b --- /dev/null +++ b/modules/nf-core/cluster_viz/tests/main.nf.test @@ -0,0 +1,52 @@ +nextflow_process { + name "Test Process CLUSTER_VIZ" + script "../main.nf" + process "CLUSTER_VIZ" + tag "modules" + tag "modules_nfcore" + tag "cluster_viz" + + test("cluster_viz - features clusters pca") { + when { + process { + """ + input[0] = [ [id:'test'], + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + input[1] = 'test' + """ + } + } + then { + assert process.success + assert snapshot( + process.out.umap, + process.out.tsne, + process.out.umap_png, + process.out.tsne_png, + process.out.pca_png, + process.out.versions + ).match() + } + } + + test("cluster_viz - features clusters pca - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + input[1] = 'test' + """ + } + } + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/nf-core/cluster_viz/tests/main.nf.test.snap 
b/modules/nf-core/cluster_viz/tests/main.nf.test.snap new file mode 100644 index 000000000000..4fe180aa20aa --- /dev/null +++ b/modules/nf-core/cluster_viz/tests/main.nf.test.snap @@ -0,0 +1,151 @@ +{ + "cluster_viz - features clusters pca": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + ] + ], + [ + [ + { + "id": "test" + }, + "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" + ] + ], + [ + [ + { + "id": "test" + }, + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + ] + ], + [ + [ + { + "id": "test" + }, + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + ] + ], + [ + [ + { + "id": "test" + }, + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + ] + ], + [ + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + ] + ], + "timestamp": "2026-04-29T14:56:42.32373645", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + }, + "cluster_viz - features clusters pca - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + ] + ], + "5": [ + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + ], + "pca_png": [ + [ + { + "id": "test" + }, + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + ] + ], + "tsne": [ + [ + { + "id": "test" + }, + "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" + ] + ], + "tsne_png": [ + [ + { + "id": "test" + }, + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + ] + ], + "umap": [ + [ + { + "id": "test" + }, + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + ] + ], + "umap_png": [ + 
[ + { + "id": "test" + }, + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + ] + ], + "versions": [ + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + ] + } + ], + "timestamp": "2026-04-29T14:56:54.120297782", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + } +} \ No newline at end of file From 0320a4163df54ed91e3ecc9e9a5846d89437b428 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 4 May 2026 14:44:50 +0200 Subject: [PATCH 02/38] Move custom clustering modules under custom --- modules/nf-core/cluster_metrics/meta.yml | 62 ---------- modules/nf-core/cluster_viz/meta.yml | 105 ---------------- .../cluster_metrics/environment.yml | 0 .../{ => custom}/cluster_metrics/main.nf | 4 +- .../nf-core/custom/cluster_metrics/meta.yml | 93 +++++++++++++++ .../templates/cluster_metrics.py | 0 .../tests/data/test_clusters.csv | 0 .../tests/data/test_features.tsv | 0 .../cluster_metrics/tests/main.nf.test | 4 +- .../cluster_metrics/tests/main.nf.test.snap | 10 +- .../{ => custom}/cluster_viz/environment.yml | 0 .../nf-core/{ => custom}/cluster_viz/main.nf | 4 +- modules/nf-core/custom/cluster_viz/meta.yml | 112 ++++++++++++++++++ .../cluster_viz/templates/cluster_viz.py | 0 .../cluster_viz/tests/data/test_clusters.csv | 0 .../cluster_viz/tests/data/test_features.tsv | 0 .../cluster_viz/tests/data/test_pca.eigenvec | 0 .../cluster_viz/tests/main.nf.test | 12 +- .../cluster_viz/tests/main.nf.test.snap | 0 19 files changed, 222 insertions(+), 184 deletions(-) delete mode 100644 modules/nf-core/cluster_metrics/meta.yml delete mode 100644 modules/nf-core/cluster_viz/meta.yml rename modules/nf-core/{ => custom}/cluster_metrics/environment.yml (100%) rename modules/nf-core/{ => custom}/cluster_metrics/main.nf (92%) create mode 100644 modules/nf-core/custom/cluster_metrics/meta.yml rename modules/nf-core/{ => custom}/cluster_metrics/templates/cluster_metrics.py (100%) rename modules/nf-core/{ => custom}/cluster_metrics/tests/data/test_clusters.csv (100%) rename 
modules/nf-core/{ => custom}/cluster_metrics/tests/data/test_features.tsv (100%) rename modules/nf-core/{ => custom}/cluster_metrics/tests/main.nf.test (73%) rename modules/nf-core/{ => custom}/cluster_metrics/tests/main.nf.test.snap (91%) rename modules/nf-core/{ => custom}/cluster_viz/environment.yml (100%) rename modules/nf-core/{ => custom}/cluster_viz/main.nf (93%) create mode 100644 modules/nf-core/custom/cluster_viz/meta.yml rename modules/nf-core/{ => custom}/cluster_viz/templates/cluster_viz.py (100%) rename modules/nf-core/{ => custom}/cluster_viz/tests/data/test_clusters.csv (100%) rename modules/nf-core/{ => custom}/cluster_viz/tests/data/test_features.tsv (100%) rename modules/nf-core/{ => custom}/cluster_viz/tests/data/test_pca.eigenvec (100%) rename modules/nf-core/{ => custom}/cluster_viz/tests/main.nf.test (59%) rename modules/nf-core/{ => custom}/cluster_viz/tests/main.nf.test.snap (100%) diff --git a/modules/nf-core/cluster_metrics/meta.yml b/modules/nf-core/cluster_metrics/meta.yml deleted file mode 100644 index 0c55de377200..000000000000 --- a/modules/nf-core/cluster_metrics/meta.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: "cluster_metrics" -description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) and performs k-sweep analysis" -keywords: - - clustering - - metrics - - silhouette - - calinski-harabasz - - davies-bouldin - - evaluation - -tools: - - "scikit-learn": - description: "Machine learning library for clustering metrics" - homepage: "https://scikit-learn.org/" - documentation: "https://scikit-learn.org/stable/modules/clustering.html" - licence: ["BSD-3-Clause"] - -authors: - - "@dbaku42" -maintainers: - - "@dbaku42" - -input: - - meta: - type: map - description: Groovy Map containing sample information - - features: - type: file - description: TSV file with sample_id and numeric features (e.g. 
PCA scores) - pattern: "*.tsv" - - clusters: - type: file - description: CSV/TSV file with sample_id and cluster assignment - pattern: "*_clusters.*" - - out_prefix: - type: string - description: Prefix for output files - -output: - - meta: - type: map - description: Groovy Map containing sample information - - metrics: - type: file - description: TSV with selected cluster quality metrics - pattern: "*_metrics.tsv" - - k_sweep: - type: file - description: CSV with metrics for different values of k - pattern: "*_k_sweep.csv" - - selected: - type: file - description: JSON with the selected/best metrics - pattern: "*_selected.json" - - plots: - type: file - description: Optional PNG plots (elbow, silhouette, etc.) - pattern: "*.png" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" diff --git a/modules/nf-core/cluster_viz/meta.yml b/modules/nf-core/cluster_viz/meta.yml deleted file mode 100644 index eca231e91f85..000000000000 --- a/modules/nf-core/cluster_viz/meta.yml +++ /dev/null @@ -1,105 +0,0 @@ -name: "cluster_viz" -description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster assignment" -keywords: - - clustering - - visualization - - umap - - tsne - - pca - -tools: - - "scikit-learn": - description: "Machine learning library" - homepage: "https://scikit-learn.org/" - documentation: "https://scikit-learn.org/stable/" - licence: ["BSD-3-Clause"] - - "umap-learn": - description: "Uniform Manifold Approximation and Projection" - homepage: "https://umap-learn.readthedocs.io/" - documentation: "https://umap-learn.readthedocs.io/" - licence: ["BSD-3-Clause"] - -authors: - - "@dbaku42" -maintainers: - - "@dbaku42" - -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test' ] - - features: - type: file - description: TSV with features used for clustering - pattern: "*.tsv" - - clusters: - type: file - description: Cluster assignment file - pattern: "*_clusters.*" - - pca_scores: - type: file - description: Original PCA scores file - pattern: "*.eigenvec" - - - out_prefix: - type: string - description: Prefix for output files - -output: - umap: - - - meta: - type: map - description: | - Groovy Map containing sample information - - "_umap.tsv": - type: file - description: UMAP coordinates TSV - pattern: "_umap.tsv" - - tsne: - - - meta: - type: map - description: | - Groovy Map containing sample information - - "_tsne.tsv": - type: file - description: t-SNE coordinates TSV - pattern: "_tsne.tsv" - - umap_png: - - - meta: - type: map - description: | - Groovy Map containing sample information - - "_umap.png": - type: file - description: UMAP plot PNG - pattern: "_umap.png" - - tsne_png: - - - meta: - type: map - description: | - Groovy Map containing sample information - - "_tsne.png": - type: file - description: t-SNE plot PNG - pattern: "_tsne.png" - - pca_png: - - - meta: - type: map - description: | - Groovy Map containing sample information - - "*_pca.png": - type: file - description: PCA plot colored by cluster - pattern: "*_pca.png" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" diff --git a/modules/nf-core/cluster_metrics/environment.yml b/modules/nf-core/custom/cluster_metrics/environment.yml similarity index 100% rename from modules/nf-core/cluster_metrics/environment.yml rename to modules/nf-core/custom/cluster_metrics/environment.yml diff --git a/modules/nf-core/cluster_metrics/main.nf b/modules/nf-core/custom/cluster_metrics/main.nf similarity index 92% rename from modules/nf-core/cluster_metrics/main.nf rename to modules/nf-core/custom/cluster_metrics/main.nf index 001f5479d85d..71b91e9c3c18 100644 --- 
a/modules/nf-core/cluster_metrics/main.nf +++ b/modules/nf-core/custom/cluster_metrics/main.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS: CLUSTER_METRICS Compute clustering quality metrics and k-sweep - Author: Donald Baku (athor) + Author: Donald Baku (author) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ @@ -32,7 +32,7 @@ process CLUSTER_METRICS { def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" """ - python3 ${projectDir}/modules/nf-core/cluster_metrics/templates/cluster_metrics.py \\ + python3 ${projectDir}/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py \\ --features ${features} \\ --clusters ${clusters} \\ --out-k-sweep ${prefix}_k_sweep.csv \\ diff --git a/modules/nf-core/custom/cluster_metrics/meta.yml b/modules/nf-core/custom/cluster_metrics/meta.yml new file mode 100644 index 000000000000..0716b49dfb6a --- /dev/null +++ b/modules/nf-core/custom/cluster_metrics/meta.yml @@ -0,0 +1,93 @@ +name: "cluster_metrics" +description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, + Davies-Bouldin) and performs k-sweep analysis" +keywords: + - clustering + - metrics + - silhouette + - calinski-harabasz + - davies-bouldin + - evaluation +tools: + - "scikit-learn": + description: "Machine learning library for clustering metrics" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: Groovy Map containing sample information + - features: + type: file + description: TSV file with sample_id and numeric features (e.g. 
PCA + scores) + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + - clusters: + type: file + description: CSV/TSV file with sample_id and cluster assignment + pattern: "*_clusters.*" + ontologies: [] + - out_prefix: + type: string + description: Prefix for output files +output: + metrics: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_metrics.tsv": + type: file + description: TSV with selected cluster quality metrics + pattern: "*_metrics.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + k_sweep: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_k_sweep.csv": + type: file + description: CSV with metrics for different values of k + pattern: "*_k_sweep.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + selected: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_selected.json": + type: file + description: JSON with the selected/best metrics + pattern: "*_selected.json" + ontologies: + - edam: http://edamontology.org/format_3464 + plots: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.png": + type: file + description: Optional PNG plots (elbow, silhouette, etc.) 
+ pattern: "*.png" + ontologies: [] + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/cluster_metrics/templates/cluster_metrics.py b/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py similarity index 100% rename from modules/nf-core/cluster_metrics/templates/cluster_metrics.py rename to modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py diff --git a/modules/nf-core/cluster_metrics/tests/data/test_clusters.csv b/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv similarity index 100% rename from modules/nf-core/cluster_metrics/tests/data/test_clusters.csv rename to modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv diff --git a/modules/nf-core/cluster_metrics/tests/data/test_features.tsv b/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv similarity index 100% rename from modules/nf-core/cluster_metrics/tests/data/test_features.tsv rename to modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv diff --git a/modules/nf-core/cluster_metrics/tests/main.nf.test b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test similarity index 73% rename from modules/nf-core/cluster_metrics/tests/main.nf.test rename to modules/nf-core/custom/cluster_metrics/tests/main.nf.test index c66d8349ed93..e335db60dd74 100644 --- a/modules/nf-core/cluster_metrics/tests/main.nf.test +++ b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test @@ -10,7 +10,7 @@ nextflow_process { when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), 
file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] input[1] = 'test' """ } @@ -31,7 +31,7 @@ nextflow_process { when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] input[1] = 'test' """ } diff --git a/modules/nf-core/cluster_metrics/tests/main.nf.test.snap b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap similarity index 91% rename from modules/nf-core/cluster_metrics/tests/main.nf.test.snap rename to modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap index 30f9c8ea46a9..62189103b866 100644 --- a/modules/nf-core/cluster_metrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap @@ -15,7 +15,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" + "test_k_sweep.csv:md5,6be53a6eb8379a9ac718692a86702e3b" ] ], "2": [ @@ -47,7 +47,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" + "test_k_sweep.csv:md5,6be53a6eb8379a9ac718692a86702e3b" ] ], "metrics": [ @@ -84,7 +84,7 @@ ] } ], - "timestamp": "2026-04-29T14:41:23.201098606", + "timestamp": "2026-05-04T14:41:43.518699109", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -105,7 +105,7 @@ { "id": "test" }, - 
"test_k_sweep.csv:md5,b321710d5bc65ecdd9894da7e0de7d67" + "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" ] ], [ @@ -120,7 +120,7 @@ "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" ] ], - "timestamp": "2026-04-29T14:41:18.57417712", + "timestamp": "2026-05-04T14:41:38.770667954", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/cluster_viz/environment.yml b/modules/nf-core/custom/cluster_viz/environment.yml similarity index 100% rename from modules/nf-core/cluster_viz/environment.yml rename to modules/nf-core/custom/cluster_viz/environment.yml diff --git a/modules/nf-core/cluster_viz/main.nf b/modules/nf-core/custom/cluster_viz/main.nf similarity index 93% rename from modules/nf-core/cluster_viz/main.nf rename to modules/nf-core/custom/cluster_viz/main.nf index a68e4cc6a1ad..a5d6d9eb8e50 100644 --- a/modules/nf-core/cluster_viz/main.nf +++ b/modules/nf-core/custom/cluster_viz/main.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS: CLUSTER_VIZ Generates PCA, UMAP and t-SNE visualizations colored by cluster - Author: Donald Baku (athor) + Author: Donald Baku (author) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ @@ -33,7 +33,7 @@ process CLUSTER_VIZ { def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" """ - python3 ${projectDir}/modules/nf-core/cluster_viz/templates/cluster_viz.py \\ + python3 ${projectDir}/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py \\ --features ${features} \\ --clusters ${clusters} \\ --pca-scores ${pca_scores} \\ diff --git a/modules/nf-core/custom/cluster_viz/meta.yml b/modules/nf-core/custom/cluster_viz/meta.yml new file mode 100644 index 000000000000..cdad47cfcb26 --- /dev/null +++ b/modules/nf-core/custom/cluster_viz/meta.yml @@ -0,0 +1,112 @@ +name: "cluster_viz" +description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" 
+keywords: + - clustering + - visualization + - pca + - umap + - tsne + - dimension-reduction +tools: + - "scikit-learn": + description: "Machine learning library for dimension reduction (PCA, t-SNE)" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" + - "umap-learn": + description: "Uniform Manifold Approximation and Projection for dimension reduction" + homepage: "https://umap-learn.readthedocs.io/" + documentation: "https://umap-learn.readthedocs.io/en/latest/" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: Groovy Map containing sample information + - features: + type: file + description: TSV file with sample_id and numeric features + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + - clusters: + type: file + description: CSV/TSV file with sample_id and cluster assignment + pattern: "*_clusters.*" + ontologies: [] + - pca_scores: + type: file + description: TSV file with PCA scores from previous step + pattern: "*_pca_scores.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + - out_prefix: + type: string + description: Prefix for output files +output: + umap: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_umap.tsv": + type: file + description: UMAP coordinates TSV file + pattern: "*_umap.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + tsne: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_tsne.tsv": + type: file + description: t-SNE coordinates TSV file + pattern: "*_tsne.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + umap_png: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_umap.png": + type: file + description: UMAP visualization plot + pattern: "*_umap.png" + ontologies: [] + tsne_png: + - - meta: + type: 
map + description: Groovy Map containing sample information + - "*_tsne.png": + type: file + description: t-SNE visualization plot + pattern: "*_tsne.png" + ontologies: [] + pca_png: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_pca.png": + type: file + description: PCA visualization plot + pattern: "*_pca.png" + ontologies: [] + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/cluster_viz/templates/cluster_viz.py b/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py similarity index 100% rename from modules/nf-core/cluster_viz/templates/cluster_viz.py rename to modules/nf-core/custom/cluster_viz/templates/cluster_viz.py diff --git a/modules/nf-core/cluster_viz/tests/data/test_clusters.csv b/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv similarity index 100% rename from modules/nf-core/cluster_viz/tests/data/test_clusters.csv rename to modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv diff --git a/modules/nf-core/cluster_viz/tests/data/test_features.tsv b/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv similarity index 100% rename from modules/nf-core/cluster_viz/tests/data/test_features.tsv rename to modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv diff --git a/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec b/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec similarity index 100% rename from modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec rename to modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec diff --git a/modules/nf-core/cluster_viz/tests/main.nf.test 
b/modules/nf-core/custom/cluster_viz/tests/main.nf.test similarity index 59% rename from modules/nf-core/cluster_viz/tests/main.nf.test rename to modules/nf-core/custom/cluster_viz/tests/main.nf.test index 341f15cf062b..3c873f6c1420 100644 --- a/modules/nf-core/cluster_viz/tests/main.nf.test +++ b/modules/nf-core/custom/cluster_viz/tests/main.nf.test @@ -11,9 +11,9 @@ nextflow_process { process { """ input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] input[1] = 'test' """ } @@ -37,9 +37,9 @@ nextflow_process { process { """ input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), - file("${projectDir}/modules/nf-core/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] input[1] = 'test' """ } diff --git a/modules/nf-core/cluster_viz/tests/main.nf.test.snap b/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap similarity index 100% rename from 
modules/nf-core/cluster_viz/tests/main.nf.test.snap rename to modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap From e42433815c9bc1590454cd80d5e2335d9a9aa77e Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 4 May 2026 15:35:55 +0200 Subject: [PATCH 03/38] Move custom clustering modules under custom --- .../cluster_metrics/templates/cluster_metrics.py | 4 ++-- .../nf-core/custom/cluster_metrics/tests/main.nf.test | 2 ++ .../custom/cluster_metrics/tests/main.nf.test.snap | 10 +++++----- modules/nf-core/custom/cluster_viz/tests/main.nf.test | 2 ++ 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py b/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py index 93da1f6a03d0..401172dca7d3 100644 --- a/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py @@ -242,7 +242,7 @@ def main() -> None: }) sweep_df = pd.DataFrame(rows) - sweep_df.to_csv(args.out_k_sweep, sep=",", index=False) + sweep_df.to_csv(args.out_k_sweep, sep=",", index=False, float_format="%.10g") Path(args.out_selected).write_text(json.dumps(selected, indent=2)) pfx = args.out_prefix @@ -273,4 +273,4 @@ def plot_curve(metric, title, ylabel, out_png): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test index e335db60dd74..d541633b82d1 100644 --- a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test +++ b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test @@ -4,6 +4,8 @@ nextflow_process { process "CLUSTER_METRICS" tag "modules" tag "modules_nfcore" + tag "custom" + tag "custom/cluster_metrics" tag "cluster_metrics" test("cluster_metrics - features and clusters") { diff --git a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap 
b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap index 62189103b866..d6dcb96d8697 100644 --- a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap @@ -15,7 +15,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,6be53a6eb8379a9ac718692a86702e3b" + "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" ] ], "2": [ @@ -47,7 +47,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,6be53a6eb8379a9ac718692a86702e3b" + "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" ] ], "metrics": [ @@ -84,7 +84,7 @@ ] } ], - "timestamp": "2026-05-04T14:41:43.518699109", + "timestamp": "2026-05-04T15:30:54.137415161", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -105,7 +105,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,9709607199ba889999dadf4d251497be" + "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" ] ], [ @@ -120,7 +120,7 @@ "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" ] ], - "timestamp": "2026-05-04T14:41:38.770667954", + "timestamp": "2026-05-04T15:30:49.380085424", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/cluster_viz/tests/main.nf.test b/modules/nf-core/custom/cluster_viz/tests/main.nf.test index 3c873f6c1420..a65f09b9d76f 100644 --- a/modules/nf-core/custom/cluster_viz/tests/main.nf.test +++ b/modules/nf-core/custom/cluster_viz/tests/main.nf.test @@ -4,6 +4,8 @@ nextflow_process { process "CLUSTER_VIZ" tag "modules" tag "modules_nfcore" + tag "custom" + tag "custom/cluster_viz" tag "cluster_viz" test("cluster_viz - features clusters pca") { From 670bc334706cd23b6bcd38f060f2317a6d7b4a7c Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 4 May 2026 15:47:14 +0200 Subject: [PATCH 04/38] Fix custom clustering module lint --- .../templates/cluster_metrics.py | 99 ++++++++++--------- .../cluster_viz/templates/cluster_viz.py | 99 ++++++++++--------- .../custom/cluster_viz/tests/main.nf.test | 4 +- 3 files changed, 110 
insertions(+), 92 deletions(-) diff --git a/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py b/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py index 401172dca7d3..13a417ac37fe 100644 --- a/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py @@ -4,16 +4,16 @@ import json from pathlib import Path +import matplotlib import numpy as np import pandas as pd from sklearn.cluster import KMeans from sklearn.metrics import ( - silhouette_score, calinski_harabasz_score, davies_bouldin_score, + silhouette_score, ) -import matplotlib matplotlib.use("Agg") @@ -55,10 +55,7 @@ def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: return df - raise ValueError( - f"Cannot find sample ID column (expected 'sample_id' or 'IID'). " - f"Found: {list(df.columns)}" - ) + raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). Found: {list(df.columns)}") def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: @@ -69,11 +66,11 @@ def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: raise ValueError("features file must contain a sample_id column after normalization") sample_ids = df["sample_id"].astype(str) - X = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce") - X = X.fillna(X.mean(numeric_only=True)) - X = X.fillna(0.0) + x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce") + x = x.fillna(x.mean(numeric_only=True)) + x = x.fillna(0.0) - return X, sample_ids + return x, sample_ids def _looks_mostly_numeric(s: pd.Series) -> bool: @@ -120,19 +117,19 @@ def load_clusters(path: str) -> tuple[pd.DataFrame, str]: candidate_vals = df[candidate].astype(str) if not _looks_mostly_numeric(candidate_vals): - out = pd.DataFrame({ - "sample_id": candidate_vals, - "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int), - }) + out = pd.DataFrame( + { + "sample_id": 
candidate_vals, + "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int), + } + ) return out, "sample_id" - out = pd.DataFrame({ - "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int) - }) + out = pd.DataFrame({"cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int)}) return out, "row_order" -def safe_cluster_metrics(X: np.ndarray, labels: np.ndarray) -> dict: +def safe_cluster_metrics(x: np.ndarray, labels: np.ndarray) -> dict: uniq = np.unique(labels) n_clusters = len(uniq) - (1 if -1 in uniq else 0) @@ -145,9 +142,9 @@ def safe_cluster_metrics(X: np.ndarray, labels: np.ndarray) -> dict: } mask = labels != -1 - X_use, y_use = X[mask], labels[mask] + x_use, y_use = x[mask], labels[mask] - if len(X_use) < 2 or len(np.unique(y_use)) < 2: + if len(x_use) < 2 or len(np.unique(y_use)) < 2: return { "n_clusters": int(n_clusters), "silhouette": None, @@ -157,9 +154,9 @@ def safe_cluster_metrics(X: np.ndarray, labels: np.ndarray) -> dict: return { "n_clusters": int(n_clusters), - "silhouette": float(silhouette_score(X_use, y_use)), - "calinski_harabasz": float(calinski_harabasz_score(X_use, y_use)), - "davies_bouldin": float(davies_bouldin_score(X_use, y_use)), + "silhouette": float(silhouette_score(x_use, y_use)), + "calinski_harabasz": float(calinski_harabasz_score(x_use, y_use)), + "davies_bouldin": float(davies_bouldin_score(x_use, y_use)), } @@ -174,7 +171,7 @@ def main() -> None: ap.add_argument("--out-prefix", required=True) args = ap.parse_args() - X_df, sample_ids = load_features(args.features) + x_df, sample_ids = load_features(args.features) clusters_df, cluster_mode = load_clusters(args.clusters) if cluster_mode == "sample_id": @@ -182,12 +179,12 @@ def main() -> None: common = sample_ids[sample_ids.isin(clusters.index)] if len(common) > 0: - X = X_df.loc[common.index].values + x = x_df.loc[common.index].values labels = clusters.loc[common.values].values aligned_ids = common.astype(str).tolist() alignment_mode = 
"sample_id" elif len(clusters_df) == len(sample_ids): - X = X_df.values + x = x_df.values labels = clusters_df["cluster"].values aligned_ids = sample_ids.astype(str).tolist() alignment_mode = "row_order_fallback" @@ -204,15 +201,15 @@ def main() -> None: f" n_features={len(sample_ids)}\n" f" n_clusters={len(clusters_df)}" ) - X = X_df.values + x = x_df.values labels = clusters_df["cluster"].values aligned_ids = sample_ids.astype(str).tolist() alignment_mode = "row_order" - if len(X) < 2: + if len(x) < 2: raise ValueError("Need at least 2 samples to compute cluster metrics") - selected = safe_cluster_metrics(X, labels) + selected = safe_cluster_metrics(x, labels) selected["input_clusters"] = Path(args.clusters).name selected["input_features"] = Path(args.features).name selected["n_samples_used"] = int(len(aligned_ids)) @@ -222,24 +219,26 @@ def main() -> None: pd.DataFrame([selected]).to_csv(metrics_tsv, sep="\t", index=False) rows = [] - max_k = min(int(args.k_max), len(X)) + max_k = min(int(args.k_max), len(x)) for k in range(int(args.k_min), max_k + 1): model = KMeans(n_clusters=k, n_init="auto", random_state=42) - y = model.fit_predict(X) + y = model.fit_predict(x) sil = ch = db = None - if 1 < len(np.unique(y)) < len(X): - sil = float(silhouette_score(X, y)) - ch = float(calinski_harabasz_score(X, y)) - db = float(davies_bouldin_score(X, y)) - - rows.append({ - "k": k, - "inertia": float(model.inertia_), - "silhouette": sil, - "calinski_harabasz": ch, - "davies_bouldin": db, - }) + if 1 < len(np.unique(y)) < len(x): + sil = float(silhouette_score(x, y)) + ch = float(calinski_harabasz_score(x, y)) + db = float(davies_bouldin_score(x, y)) + + rows.append( + { + "k": k, + "inertia": float(model.inertia_), + "silhouette": sil, + "calinski_harabasz": ch, + "davies_bouldin": db, + } + ) sweep_df = pd.DataFrame(rows) sweep_df.to_csv(args.out_k_sweep, sep=",", index=False, float_format="%.10g") @@ -265,8 +264,18 @@ def plot_curve(metric, title, ylabel, out_png): if not 
sweep_df.empty: plot_curve("inertia", "Elbow method (KMeans inertia)", "inertia", f"{pfx}_elbow.png") plot_curve("silhouette", "Silhouette score (higher is better)", "silhouette", f"{pfx}_silhouette.png") - plot_curve("davies_bouldin", "Davies-Bouldin index (lower is better)", "davies_bouldin", f"{pfx}_davies_bouldin.png") - plot_curve("calinski_harabasz", "Calinski-Harabasz index (higher is better)", "calinski_harabasz", f"{pfx}_calinski.png") + plot_curve( + "davies_bouldin", + "Davies-Bouldin index (lower is better)", + "davies_bouldin", + f"{pfx}_davies_bouldin.png", + ) + plot_curve( + "calinski_harabasz", + "Calinski-Harabasz index (higher is better)", + "calinski_harabasz", + f"{pfx}_calinski.png", + ) except Exception as e: Path("plot_warning.txt").write_text(f"Plotting failed: {e}\n") diff --git a/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py b/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py index 18ffbba4fff7..020a65db986e 100644 --- a/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py +++ b/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py @@ -56,22 +56,15 @@ def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: df = df.drop(columns=fid_cols) return df - raise ValueError( - f"Cannot find sample ID column (expected 'sample_id' or 'IID'). " - f"Found: {list(df.columns)}" - ) + raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). 
Found: {list(df.columns)}") def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: df = pd.read_csv(path, sep=r"\s+", engine="python", dtype=str) df = _normalise_id_column(df) sample_ids = df["sample_id"].astype(str) - X = ( - df.drop(columns=["sample_id"]) - .apply(pd.to_numeric, errors="coerce") - .fillna(0.0) - ) - return X, sample_ids + x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce").fillna(0.0) + return x, sample_ids def load_clusters(path: str) -> pd.Series: @@ -89,26 +82,27 @@ def safe_perplexity(n_samples: int, requested: float) -> float: return float(max(2.0, min(requested, upper))) -def compute_umap(X: np.ndarray, n_neighbors: int, min_dist: float) -> np.ndarray: +def compute_umap(x: np.ndarray, n_neighbors: int, min_dist: float) -> np.ndarray: try: import umap + return umap.UMAP( n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, random_state=42, - ).fit_transform(X) + ).fit_transform(x) except Exception as e: print(f"[WARN] UMAP failed, fallback to first 2 feature columns: {e}") - if X.shape[1] >= 2: - return X[:, :2] - elif X.shape[1] == 1: - return np.column_stack([X[:, 0], np.zeros(X.shape[0])]) + if x.shape[1] >= 2: + return x[:, :2] + elif x.shape[1] == 1: + return np.column_stack([x[:, 0], np.zeros(x.shape[0])]) else: - return np.zeros((X.shape[0], 2)) + return np.zeros((x.shape[0], 2)) -def compute_tsne(X: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray: +def compute_tsne(x: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray: return TSNE( n_components=2, perplexity=perplexity, @@ -116,7 +110,7 @@ def compute_tsne(X: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray: random_state=42, max_iter=max_iter, learning_rate="auto", - ).fit_transform(X) + ).fit_transform(x) def plot_scatter(df, x, y, out_png, title, xlabel=None, ylabel=None): @@ -126,19 +120,32 @@ def plot_scatter(df, x, y, out_png, title, xlabel=None, ylabel=None): plt.figure(figsize=(7, 5)) labels = 
df["cluster"].astype(int).values uniq = np.unique(labels) - sc = plt.scatter(df[x], df[y], c=labels, cmap="Paired", s=24,linewidths=0.4, alpha=0.85) + sc = plt.scatter(df[x], df[y], c=labels, cmap="Paired", s=24, linewidths=0.4, alpha=0.85) plt.title(title) plt.xlabel(xlabel or x) plt.ylabel(ylabel or y) plt.grid(True, alpha=0.5) handles = [ - Line2D([0], [0], marker="o", linestyle="", markersize=7, - markerfacecolor=sc.cmap(sc.norm(k)), markeredgecolor="none", - label=f"Cluster {k}") + Line2D( + [0], + [0], + marker="o", + linestyle="", + markersize=7, + markerfacecolor=sc.cmap(sc.norm(k)), + markeredgecolor="none", + label=f"Cluster {k}", + ) for k in uniq ] - plt.legend(handles=handles, title="Clusters", loc="center left", - bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0, frameon=True) + plt.legend( + handles=handles, + title="Clusters", + loc="center left", + bbox_to_anchor=(1.02, 0.5), + borderaxespad=0.0, + frameon=True, + ) plt.tight_layout() plt.savefig(out_png, dpi=200, bbox_inches="tight") plt.close() @@ -160,7 +167,7 @@ def main() -> None: ap.add_argument("--out-pca-png", required=True) args = ap.parse_args() - X_df, sample_ids = load_features(args.features) + x_df, sample_ids = load_features(args.features) clusters = load_clusters(args.clusters) common = sample_ids[sample_ids.isin(clusters.index)] @@ -171,30 +178,33 @@ def main() -> None: f" clusters IDs (first 5): {list(clusters.index[:5])}" ) - X = X_df.loc[common.index].values + x = x_df.loc[common.index].values y = clusters.loc[common.values].values - umap_coords = compute_umap(X, args.umap_neighbors, args.umap_min_dist) - umap_df = pd.DataFrame({ - "sample_id": common.values, - "x": umap_coords[:, 0], - "y": umap_coords[:, 1], - "cluster": y, - }) + umap_coords = compute_umap(x, args.umap_neighbors, args.umap_min_dist) + umap_df = pd.DataFrame( + { + "sample_id": common.values, + "x": umap_coords[:, 0], + "y": umap_coords[:, 1], + "cluster": y, + } + ) umap_df.to_csv(args.out_umap_tsv, sep="\t", 
index=False) plot_scatter(umap_df, "x", "y", args.out_umap_png, "UMAP embedding") perp = safe_perplexity(len(common), args.tsne_perplexity) - tsne_coords = compute_tsne(X, perp, args.tsne_iter) - tsne_df = pd.DataFrame({ - "sample_id": common.values, - "x": tsne_coords[:, 0], - "y": tsne_coords[:, 1], - "cluster": y, - }) + tsne_coords = compute_tsne(x, perp, args.tsne_iter) + tsne_df = pd.DataFrame( + { + "sample_id": common.values, + "x": tsne_coords[:, 0], + "y": tsne_coords[:, 1], + "cluster": y, + } + ) tsne_df.to_csv(args.out_tsne_tsv, sep="\t", index=False) - plot_scatter(tsne_df, "x", "y", args.out_tsne_png, - f"t-SNE (perplexity={perp:.1f})") + plot_scatter(tsne_df, "x", "y", args.out_tsne_png, f"t-SNE (perplexity={perp:.1f})") pca_df = pd.read_csv(args.pca_scores, sep=r"\s+", engine="python", dtype=str) pca_df = _normalise_id_column(pca_df) @@ -205,8 +215,7 @@ def main() -> None: for col in [c1, c2]: pca_df[col] = pd.to_numeric(pca_df[col], errors="coerce") merged = pca_df.merge(umap_df[["sample_id", "cluster"]], on="sample_id", how="inner") - plot_scatter(merged, c1, c2, args.out_pca_png, - "PCA", c1, c2) + plot_scatter(merged, c1, c2, args.out_pca_png, "PCA", c1, c2) if __name__ == "__main__": diff --git a/modules/nf-core/custom/cluster_viz/tests/main.nf.test b/modules/nf-core/custom/cluster_viz/tests/main.nf.test index a65f09b9d76f..4d2da62b7cd1 100644 --- a/modules/nf-core/custom/cluster_viz/tests/main.nf.test +++ b/modules/nf-core/custom/cluster_viz/tests/main.nf.test @@ -12,7 +12,7 @@ nextflow_process { when { process { """ - input[0] = [ [id:'test'], + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] @@ -38,7 +38,7 @@ nextflow_process { when { process 
{ """ - input[0] = [ [id:'test'], + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] From 3a26119864710495502c76d6c5c10c5756d19ecd Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 4 May 2026 16:33:30 +0200 Subject: [PATCH 05/38] Fix custom clustering module lint and snapshots --- .../cluster_metrics/tests/main.nf.test.snap | 26 +++++++------- .../cluster_viz/tests/main.nf.test.snap | 34 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap index d6dcb96d8697..095955fd70ce 100644 --- a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap @@ -32,15 +32,15 @@ "id": "test" }, [ - "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", - "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", - "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", - "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + "test_calinski.png:md5,9ad4ecb0779b571ed6f869b5523d821c", + "test_davies_bouldin.png:md5,87eeef85f2bdc97cd3faab3c984fa4cc", + "test_elbow.png:md5,e1980b81ed477b8defe062f5e1dd9af3", + "test_silhouette.png:md5,4fff8e7a883ed692329e4d73485e0e50" ] ] ], "4": [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" ], "k_sweep": [ [ @@ -64,10 +64,10 @@ "id": "test" }, [ - "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", - "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", - "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", - 
"test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + "test_calinski.png:md5,9ad4ecb0779b571ed6f869b5523d821c", + "test_davies_bouldin.png:md5,87eeef85f2bdc97cd3faab3c984fa4cc", + "test_elbow.png:md5,e1980b81ed477b8defe062f5e1dd9af3", + "test_silhouette.png:md5,4fff8e7a883ed692329e4d73485e0e50" ] ] ], @@ -80,11 +80,11 @@ ] ], "versions": [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" ] } ], - "timestamp": "2026-05-04T15:30:54.137415161", + "timestamp": "2026-05-04T16:19:18.154807795", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -117,10 +117,10 @@ ] ], [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" ] ], - "timestamp": "2026-05-04T15:30:49.380085424", + "timestamp": "2026-05-04T16:18:56.462902926", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap b/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap index 4fe180aa20aa..2b02e044f269 100644 --- a/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap +++ b/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap @@ -6,7 +6,7 @@ { "id": "test" }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" ] ], [ @@ -22,7 +22,7 @@ { "id": "test" }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" ] ], [ @@ -30,7 +30,7 @@ { "id": "test" }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + "test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" ] ], [ @@ -38,14 +38,14 @@ { "id": "test" }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" ] ], [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" ] ], - "timestamp": "2026-04-29T14:56:42.32373645", + "timestamp": 
"2026-05-04T16:25:58.786254808", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -59,7 +59,7 @@ { "id": "test" }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" ] ], "1": [ @@ -75,7 +75,7 @@ { "id": "test" }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" ] ], "3": [ @@ -83,7 +83,7 @@ { "id": "test" }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + "test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" ] ], "4": [ @@ -91,18 +91,18 @@ { "id": "test" }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" ] ], "5": [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" ], "pca_png": [ [ { "id": "test" }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" + "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" ] ], "tsne": [ @@ -118,7 +118,7 @@ { "id": "test" }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" + "test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" ] ], "umap": [ @@ -126,7 +126,7 @@ { "id": "test" }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" + "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" ] ], "umap_png": [ @@ -134,15 +134,15 @@ { "id": "test" }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" + "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" ] ], "versions": [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" + "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" ] } ], - "timestamp": "2026-04-29T14:56:54.120297782", + "timestamp": "2026-05-04T16:26:22.932828948", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" From e4877e48bd416dcf183added8008511a582f3247 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 6 May 2026 17:31:48 +0200 Subject: [PATCH 06/38] Address review comments for clustering custom modules --- .../environment.yml | 0 .../main.nf | 2 +- 
.../meta.yml | 2 +- .../templates/cluster_metrics.py | 0 .../tests/data/test_clusters.csv | 0 .../tests/data/test_features.tsv | 0 .../tests/main.nf.test | 12 +-- .../tests/main.nf.test.snap | 96 +++++++++---------- .../environment.yml | 0 .../main.nf | 2 +- .../meta.yml | 2 +- .../templates/cluster_viz.py | 0 .../tests/data/test_clusters.csv | 0 .../tests/data/test_features.tsv | 0 .../tests/data/test_pca.eigenvec | 0 .../tests/main.nf.test | 20 ++-- .../tests/main.nf.test.snap | 38 ++++---- 17 files changed, 87 insertions(+), 87 deletions(-) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/environment.yml (100%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/main.nf (94%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/meta.yml (99%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/templates/cluster_metrics.py (100%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/tests/data/test_clusters.csv (100%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/tests/data/test_features.tsv (100%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/tests/main.nf.test (62%) rename modules/nf-core/custom/{cluster_metrics => clustermetrics}/tests/main.nf.test.snap (73%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/environment.yml (100%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/main.nf (95%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/meta.yml (99%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/templates/cluster_viz.py (100%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/tests/data/test_clusters.csv (100%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/tests/data/test_features.tsv (100%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/tests/data/test_pca.eigenvec (100%) rename modules/nf-core/custom/{cluster_viz => 
clustervisualiation}/tests/main.nf.test (68%) rename modules/nf-core/custom/{cluster_viz => clustervisualiation}/tests/main.nf.test.snap (70%) diff --git a/modules/nf-core/custom/cluster_metrics/environment.yml b/modules/nf-core/custom/clustermetrics/environment.yml similarity index 100% rename from modules/nf-core/custom/cluster_metrics/environment.yml rename to modules/nf-core/custom/clustermetrics/environment.yml diff --git a/modules/nf-core/custom/cluster_metrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf similarity index 94% rename from modules/nf-core/custom/cluster_metrics/main.nf rename to modules/nf-core/custom/clustermetrics/main.nf index 71b91e9c3c18..101f3cc78e84 100644 --- a/modules/nf-core/custom/cluster_metrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -32,7 +32,7 @@ process CLUSTER_METRICS { def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" """ - python3 ${projectDir}/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py \\ + python3 ${moduleDir}/templates/cluster_metrics.py \\ --features ${features} \\ --clusters ${clusters} \\ --out-k-sweep ${prefix}_k_sweep.csv \\ diff --git a/modules/nf-core/custom/cluster_metrics/meta.yml b/modules/nf-core/custom/clustermetrics/meta.yml similarity index 99% rename from modules/nf-core/custom/cluster_metrics/meta.yml rename to modules/nf-core/custom/clustermetrics/meta.yml index 0716b49dfb6a..2944931ae49c 100644 --- a/modules/nf-core/custom/cluster_metrics/meta.yml +++ b/modules/nf-core/custom/clustermetrics/meta.yml @@ -1,4 +1,4 @@ -name: "cluster_metrics" +name: "clustermetrics" description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) and performs k-sweep analysis" keywords: diff --git a/modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py similarity index 100% rename from 
modules/nf-core/custom/cluster_metrics/templates/cluster_metrics.py rename to modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py diff --git a/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv b/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv similarity index 100% rename from modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv rename to modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv diff --git a/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv b/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv similarity index 100% rename from modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv rename to modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv diff --git a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test b/modules/nf-core/custom/clustermetrics/tests/main.nf.test similarity index 62% rename from modules/nf-core/custom/cluster_metrics/tests/main.nf.test rename to modules/nf-core/custom/clustermetrics/tests/main.nf.test index d541633b82d1..351d1c00df79 100644 --- a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test @@ -5,14 +5,14 @@ nextflow_process { tag "modules" tag "modules_nfcore" tag "custom" - tag "custom/cluster_metrics" - tag "cluster_metrics" + tag "custom/clustermetrics" + tag "clustermetrics" - test("cluster_metrics - features and clusters") { + test("clustermetrics - features and clusters") { when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), 
file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] input[1] = 'test' """ } @@ -28,12 +28,12 @@ nextflow_process { } } - test("cluster_metrics - features and clusters - stub") { + test("clustermetrics - features and clusters - stub") { options "-stub" when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/cluster_metrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] input[1] = 'test' """ } diff --git a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap similarity index 73% rename from modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap rename to modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap index 095955fd70ce..1e28bf786221 100644 --- a/modules/nf-core/custom/cluster_metrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap @@ -1,5 +1,41 @@ { - "cluster_metrics - features and clusters - stub": { + "clustermetrics - features and clusters": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + ] + ], + [ + [ + { + "id": "test" + }, + "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" + ] + ], + [ + [ + { + "id": "test" + }, + "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + ] + ], + [ + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + ] + ], + "timestamp": "2026-05-06T17:29:16.510271878", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + }, + "clustermetrics - features and 
clusters - stub": { "content": [ { "0": [ @@ -32,15 +68,15 @@ "id": "test" }, [ - "test_calinski.png:md5,9ad4ecb0779b571ed6f869b5523d821c", - "test_davies_bouldin.png:md5,87eeef85f2bdc97cd3faab3c984fa4cc", - "test_elbow.png:md5,e1980b81ed477b8defe062f5e1dd9af3", - "test_silhouette.png:md5,4fff8e7a883ed692329e4d73485e0e50" + "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", + "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", + "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", + "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" ] ] ], "4": [ - "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" ], "k_sweep": [ [ @@ -64,10 +100,10 @@ "id": "test" }, [ - "test_calinski.png:md5,9ad4ecb0779b571ed6f869b5523d821c", - "test_davies_bouldin.png:md5,87eeef85f2bdc97cd3faab3c984fa4cc", - "test_elbow.png:md5,e1980b81ed477b8defe062f5e1dd9af3", - "test_silhouette.png:md5,4fff8e7a883ed692329e4d73485e0e50" + "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", + "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", + "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", + "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" ] ] ], @@ -80,47 +116,11 @@ ] ], "versions": [ - "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" + "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" ] } ], - "timestamp": "2026-05-04T16:19:18.154807795", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - }, - "cluster_metrics - features and clusters": { - "content": [ - [ - [ - { - "id": "test" - }, - "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" - ] - ], - [ - [ - { - "id": "test" - }, - "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" - ] - ], - [ - [ - { - "id": "test" - }, - "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" - ] - ], - [ - "versions.yml:md5,7f675f156526d327dab66b3edbcaf81d" - ] - ], - "timestamp": "2026-05-04T16:18:56.462902926", + 
"timestamp": "2026-05-06T17:29:21.180634202", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/cluster_viz/environment.yml b/modules/nf-core/custom/clustervisualiation/environment.yml similarity index 100% rename from modules/nf-core/custom/cluster_viz/environment.yml rename to modules/nf-core/custom/clustervisualiation/environment.yml diff --git a/modules/nf-core/custom/cluster_viz/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf similarity index 95% rename from modules/nf-core/custom/cluster_viz/main.nf rename to modules/nf-core/custom/clustervisualiation/main.nf index a5d6d9eb8e50..58cb89db8142 100644 --- a/modules/nf-core/custom/cluster_viz/main.nf +++ b/modules/nf-core/custom/clustervisualiation/main.nf @@ -33,7 +33,7 @@ process CLUSTER_VIZ { def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" """ - python3 ${projectDir}/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py \\ + python3 ${moduleDir}/templates/cluster_viz.py \\ --features ${features} \\ --clusters ${clusters} \\ --pca-scores ${pca_scores} \\ diff --git a/modules/nf-core/custom/cluster_viz/meta.yml b/modules/nf-core/custom/clustervisualiation/meta.yml similarity index 99% rename from modules/nf-core/custom/cluster_viz/meta.yml rename to modules/nf-core/custom/clustervisualiation/meta.yml index cdad47cfcb26..f08ba6bfbc35 100644 --- a/modules/nf-core/custom/cluster_viz/meta.yml +++ b/modules/nf-core/custom/clustervisualiation/meta.yml @@ -1,4 +1,4 @@ -name: "cluster_viz" +name: "clustervisualiation" description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" keywords: - clustering diff --git a/modules/nf-core/custom/cluster_viz/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py similarity index 100% rename from modules/nf-core/custom/cluster_viz/templates/cluster_viz.py rename to modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py diff --git 
a/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv b/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv similarity index 100% rename from modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv rename to modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv diff --git a/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv b/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv similarity index 100% rename from modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv rename to modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv diff --git a/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec b/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec similarity index 100% rename from modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec rename to modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec diff --git a/modules/nf-core/custom/cluster_viz/tests/main.nf.test b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test similarity index 68% rename from modules/nf-core/custom/cluster_viz/tests/main.nf.test rename to modules/nf-core/custom/clustervisualiation/tests/main.nf.test index 4d2da62b7cd1..f4695eedae1e 100644 --- a/modules/nf-core/custom/cluster_viz/tests/main.nf.test +++ b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test @@ -5,17 +5,17 @@ nextflow_process { tag "modules" tag "modules_nfcore" tag "custom" - tag "custom/cluster_viz" - tag "cluster_viz" + tag "custom/clustervisualiation" + tag "clustervisualiation" - test("cluster_viz - features clusters pca") { + test("clustervisualiation - features clusters pca") { when { process { """ input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: 
true), - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec", checkIfExists: true) ] input[1] = 'test' """ } @@ -33,15 +33,15 @@ nextflow_process { } } - test("cluster_viz - features clusters pca - stub") { + test("clustervisualiation - features clusters pca - stub") { options "-stub" when { process { """ input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_clusters.csv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/cluster_viz/tests/data/test_pca.eigenvec", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec", checkIfExists: true) ] input[1] = 'test' """ } diff --git a/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap similarity index 70% rename from modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap rename to modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap index 2b02e044f269..bc8ca92380e9 100644 --- a/modules/nf-core/custom/cluster_viz/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap @@ -1,12 +1,12 @@ { - "cluster_viz - features clusters pca": { + 
"clustervisualiation - features clusters pca": { "content": [ [ [ { "id": "test" }, - "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" ] ], [ @@ -22,7 +22,7 @@ { "id": "test" }, - "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" ] ], [ @@ -30,7 +30,7 @@ { "id": "test" }, - "test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" ] ], [ @@ -38,20 +38,20 @@ { "id": "test" }, - "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" ] ], [ - "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" ] ], - "timestamp": "2026-05-04T16:25:58.786254808", + "timestamp": "2026-05-06T17:29:34.854148226", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" } }, - "cluster_viz - features clusters pca - stub": { + "clustervisualiation - features clusters pca - stub": { "content": [ { "0": [ @@ -59,7 +59,7 @@ { "id": "test" }, - "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" ] ], "1": [ @@ -75,7 +75,7 @@ { "id": "test" }, - "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" ] ], "3": [ @@ -83,7 +83,7 @@ { "id": "test" }, - "test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" ] ], "4": [ @@ -91,18 +91,18 @@ { "id": "test" }, - "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" ] ], "5": [ - "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" ], "pca_png": [ [ { "id": "test" }, - "test_pca.png:md5,eed3e7e2b50aac1f2fd4659cde79300b" + "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" ] ], "tsne": [ @@ -118,7 +118,7 @@ { "id": "test" }, - 
"test_tsne.png:md5,16a8f8d45905af0f2312fc3b3cefe93f" + "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" ] ], "umap": [ @@ -126,7 +126,7 @@ { "id": "test" }, - "test_umap.tsv:md5,81ca2c64bffcac064124703e144b3aa4" + "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" ] ], "umap_png": [ @@ -134,15 +134,15 @@ { "id": "test" }, - "test_umap.png:md5,c9b22ffd5236909e4ad212b4f8724f47" + "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" ] ], "versions": [ - "versions.yml:md5,1e4e4ef1869740e8d7aac3aa574d157c" + "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" ] } ], - "timestamp": "2026-05-04T16:26:22.932828948", + "timestamp": "2026-05-06T17:29:46.705755643", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" From 7e19b220294a8ff1e3074fca0b47b4db310630ef Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Thu, 7 May 2026 00:38:46 +0200 Subject: [PATCH 07/38] Fix custom clustering module metadata --- modules/nf-core/custom/clustermetrics/meta.yml | 2 +- modules/nf-core/custom/clustervisualiation/meta.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/meta.yml b/modules/nf-core/custom/clustermetrics/meta.yml index 2944931ae49c..a467cc634cba 100644 --- a/modules/nf-core/custom/clustermetrics/meta.yml +++ b/modules/nf-core/custom/clustermetrics/meta.yml @@ -1,4 +1,4 @@ -name: "clustermetrics" +name: "CLUSTER_METRICS" description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) and performs k-sweep analysis" keywords: diff --git a/modules/nf-core/custom/clustervisualiation/meta.yml b/modules/nf-core/custom/clustervisualiation/meta.yml index f08ba6bfbc35..9828a5c2a6bc 100644 --- a/modules/nf-core/custom/clustervisualiation/meta.yml +++ b/modules/nf-core/custom/clustervisualiation/meta.yml @@ -1,4 +1,4 @@ -name: "clustervisualiation" +name: "CLUSTER_VIZ" description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" keywords: - clustering From 
fdb1be424e843ca3dad4283951d30e9cdea25256 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Thu, 7 May 2026 13:14:16 +0200 Subject: [PATCH 08/38] Add Dockerfile for custom/clustermetrics and custom/clustervisualiation --- modules/nf-core/custom/clustermetrics/Dockerfile | 8 ++++++++ modules/nf-core/custom/clustervisualiation/Dockerfile | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 modules/nf-core/custom/clustermetrics/Dockerfile create mode 100644 modules/nf-core/custom/clustervisualiation/Dockerfile diff --git a/modules/nf-core/custom/clustermetrics/Dockerfile b/modules/nf-core/custom/clustermetrics/Dockerfile new file mode 100644 index 000000000000..1fd701bd86f8 --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/Dockerfile @@ -0,0 +1,8 @@ +FROM nfcore/base:2.0 + +LABEL authors="dbaku42" \ + description="Docker image containing all requirements for nf-core/custom/clustermetrics" + +COPY environment.yml / +RUN micromamba install -y -n base -f /environment.yml && \ + micromamba clean -a -y diff --git a/modules/nf-core/custom/clustervisualiation/Dockerfile b/modules/nf-core/custom/clustervisualiation/Dockerfile new file mode 100644 index 000000000000..577062b77918 --- /dev/null +++ b/modules/nf-core/custom/clustervisualiation/Dockerfile @@ -0,0 +1,8 @@ +FROM nfcore/base:2.0 + +LABEL authors="dbaku42" \ + description="Docker image containing all requirements for nf-core/custom/clustervisualiation" + +COPY environment.yml / +RUN micromamba install -y -n base -f /environment.yml && \ + micromamba clean -a -y From d3a382b5eb317f400f36ec37a1b518f8877b59d5 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Thu, 7 May 2026 13:38:58 +0200 Subject: [PATCH 09/38] Add container directive for custom/clustermetrics and clustervisualiation --- modules/nf-core/custom/clustermetrics/main.nf | 4 +++- modules/nf-core/custom/clustervisualiation/main.nf | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf 
b/modules/nf-core/custom/clustermetrics/main.nf index 101f3cc78e84..b157d90bfa22 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -13,7 +13,9 @@ process CLUSTER_METRICS { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/clustermetrics:dev' : + 'quay.io/nf-core/clustermetrics:dev' }" input: tuple val(meta), path(features), path(clusters) val out_prefix diff --git a/modules/nf-core/custom/clustervisualiation/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf index 58cb89db8142..0c8259de5ceb 100644 --- a/modules/nf-core/custom/clustervisualiation/main.nf +++ b/modules/nf-core/custom/clustervisualiation/main.nf @@ -13,7 +13,9 @@ process CLUSTER_VIZ { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/clustervisualiation:dev' : + 'quay.io/nf-core/clustervisualiation:dev' }" input: tuple val(meta), path(features), path(clusters), path(pca_scores) val out_prefix From 89287dbffc9c3dd33a9812bed3625095e1c9c013 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:06:10 +0200 Subject: [PATCH 10/38] Update modules/nf-core/custom/clustermetrics/main.nf Co-authored-by: Jonathan Manning --- modules/nf-core/custom/clustermetrics/main.nf | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index b157d90bfa22..4a50b288f49c 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -1,15 +1,4 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl = 2 - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PROCESS: CLUSTER_METRICS - Compute clustering quality metrics and k-sweep - Author: Donald Baku (author) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -process CLUSTER_METRICS { +process CUSTOM_CLUSTERMETRICS { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" From d5ac668b057e90d6a42ebf9381e126b955299e00 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:06:24 +0200 Subject: [PATCH 11/38] Update modules/nf-core/custom/clustermetrics/main.nf Co-authored-by: Jonathan Manning --- modules/nf-core/custom/clustermetrics/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index 4a50b288f49c..a15db53eeae1 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ 
-2,9 +2,9 @@ process CUSTOM_CLUSTERMETRICS { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/clustermetrics:dev' : - 'quay.io/nf-core/clustermetrics:dev' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/69/69a6d33f6bd1a901cad8a6914b6ad11a7db6c35005b4ff8604f20f1baba10fc3/data' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:b7d7028d28dc4084' }" input: tuple val(meta), path(features), path(clusters) val out_prefix From 55e868a733c2579d497ae98248ae143c68f06d5d Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:06:39 +0200 Subject: [PATCH 12/38] Update modules/nf-core/custom/clustermetrics/main.nf Co-authored-by: Jonathan Manning --- modules/nf-core/custom/clustermetrics/main.nf | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index a15db53eeae1..23b9447dd36c 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -20,19 +20,18 @@ process CUSTOM_CLUSTERMETRICS { task.ext.when == null || task.ext.when script: - def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" + template 'cluster_metrics.py' + stub: + def prefix = task.ext.prefix ?: "${meta.id}" """ - python3 ${moduleDir}/templates/cluster_metrics.py \\ - --features ${features} \\ - --clusters ${clusters} \\ - --out-k-sweep ${prefix}_k_sweep.csv \\ - --out-selected ${prefix}_selected.json \\ - --out-prefix ${prefix} + touch ${prefix}_metrics.tsv + touch ${prefix}_k_sweep.csv + touch ${prefix}_selected.json cat 
<<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python3 --version | cut -d' ' -f2) + python: \$(python3 --version | sed 's/Python //') pandas: \$(python3 -c "import pandas; print(pandas.__version__)") scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") From e492edbd7b876c38398be07081f4f231e775c9f9 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:06:48 +0200 Subject: [PATCH 13/38] Update modules/nf-core/custom/clustermetrics/main.nf Co-authored-by: Jonathan Manning --- modules/nf-core/custom/clustermetrics/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index 23b9447dd36c..1c8fe203fc9f 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -7,7 +7,6 @@ process CUSTOM_CLUSTERMETRICS { 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:b7d7028d28dc4084' }" input: tuple val(meta), path(features), path(clusters) - val out_prefix output: tuple val(meta), path("*_metrics.tsv") , emit: metrics From b8529f67d7b3f054d86a4506f105562906c3d1cd Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:07:02 +0200 Subject: [PATCH 14/38] Update modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py Co-authored-by: Jonathan Manning --- .../custom/clustervisualiation/templates/cluster_viz.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py index 020a65db986e..7d470fc2a303 100644 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ 
b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py @@ -68,11 +68,10 @@ def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: def load_clusters(path: str) -> pd.Series: - df = pd.read_csv(path, sep=",", dtype=str) - df = _normalise_id_column(df) - if "cluster" not in df.columns: - raise ValueError("clusters CSV must have a 'cluster' column") - return df.set_index("sample_id")["cluster"].astype(int) + df = pd.read_csv(path) + if "sample_id" not in df.columns or "cluster" not in df.columns: + raise ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") + return df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) def safe_perplexity(n_samples: int, requested: float) -> float: From 12bb00d90f449303a0afe87fc8bace8f8a35a87d Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:07:25 +0200 Subject: [PATCH 15/38] Update modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py Co-authored-by: Jonathan Manning --- .../templates/cluster_viz.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py index 7d470fc2a303..108d19f80e3a 100644 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py @@ -82,24 +82,13 @@ def safe_perplexity(n_samples: int, requested: float) -> float: def compute_umap(x: np.ndarray, n_neighbors: int, min_dist: float) -> np.ndarray: - try: - import umap - - return umap.UMAP( - n_components=2, - n_neighbors=n_neighbors, - min_dist=min_dist, - random_state=42, - ).fit_transform(x) - except Exception as e: - print(f"[WARN] UMAP failed, fallback to first 2 feature columns: {e}") - if x.shape[1] >= 2: - return x[:, :2] - elif x.shape[1] == 1: - 
return np.column_stack([x[:, 0], np.zeros(x.shape[0])]) - else: - return np.zeros((x.shape[0], 2)) - + import umap + return umap.UMAP( + n_components=2, + n_neighbors=n_neighbors, + min_dist=min_dist, + random_state=42, + ).fit_transform(x) def compute_tsne(x: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray: return TSNE( From a4a01a388918ae5fe546e180f523d2fdb5e3b8b9 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Fri, 8 May 2026 13:07:37 +0200 Subject: [PATCH 16/38] Update modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py Co-authored-by: Jonathan Manning --- .../clustervisualiation/templates/cluster_viz.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py index 108d19f80e3a..7d757ca26873 100644 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py @@ -194,16 +194,6 @@ def main() -> None: tsne_df.to_csv(args.out_tsne_tsv, sep="\t", index=False) plot_scatter(tsne_df, "x", "y", args.out_tsne_png, f"t-SNE (perplexity={perp:.1f})") - pca_df = pd.read_csv(args.pca_scores, sep=r"\s+", engine="python", dtype=str) - pca_df = _normalise_id_column(pca_df) - comp_cols = [c for c in pca_df.columns if c != "sample_id"] - if len(comp_cols) < 2: - raise ValueError("pca_scores must have at least 2 PC columns") - c1, c2 = comp_cols[0], comp_cols[1] - for col in [c1, c2]: - pca_df[col] = pd.to_numeric(pca_df[col], errors="coerce") - merged = pca_df.merge(umap_df[["sample_id", "cluster"]], on="sample_id", how="inner") - plot_scatter(merged, c1, c2, args.out_pca_png, "PCA", c1, c2) if __name__ == "__main__": From 060eb5da79477b5f845e1b985ff68ed2bb6198f3 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Fri, 8 May 2026 13:25:32 +0200 Subject: [PATCH 17/38] fix: use template for 
cluster visualization module --- .../custom/clustervisualiation/main.nf | 22 +---------- .../templates/cluster_viz.py | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/modules/nf-core/custom/clustervisualiation/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf index 0c8259de5ceb..c2682992c436 100644 --- a/modules/nf-core/custom/clustervisualiation/main.nf +++ b/modules/nf-core/custom/clustervisualiation/main.nf @@ -32,26 +32,6 @@ process CLUSTER_VIZ { task.ext.when == null || task.ext.when script: - def prefix = task.ext.prefix ?: out_prefix ?: "${meta.id}" + template 'cluster_viz.py' - """ - python3 ${moduleDir}/templates/cluster_viz.py \\ - --features ${features} \\ - --clusters ${clusters} \\ - --pca-scores ${pca_scores} \\ - --out-umap-tsv ${prefix}_umap.tsv \\ - --out-tsne-tsv ${prefix}_tsne.tsv \\ - --out-umap-png ${prefix}_umap.png \\ - --out-tsne-png ${prefix}_tsne.png \\ - --out-pca-png ${prefix}_pca.png - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | cut -d' ' -f2) - pandas: \$(python3 -c "import pandas; print(pandas.__version__)") - scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") - umap-learn: \$(python3 -c "import umap; print(umap.__version__)" 2>/dev/null || echo 'N/A') - matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") - END_VERSIONS - """ } diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py index 7d757ca26873..63f6b0c24a10 100644 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py @@ -197,4 +197,41 @@ def main() -> None: if __name__ == "__main__": + import sys + import platform + + prefix = "${task.ext.prefix ?: out_prefix ?: meta.id}" + + sys.argv = [ + "cluster_viz.py", + "--features", "$features", + 
"--clusters", "$clusters", + "--pca-scores", "$pca_scores", + "--out-umap-tsv", f"{prefix}_umap.tsv", + "--out-tsne-tsv", f"{prefix}_tsne.tsv", + "--out-umap-png", f"{prefix}_umap.png", + "--out-tsne-png", f"{prefix}_tsne.png", + "--out-pca-png", f"{prefix}_pca.png", + ] + main() + + import matplotlib + import pandas + import sklearn + + try: + import umap + umap_version = umap.__version__ + except Exception: + umap_version = "N/A" + + with open("versions.yml", "w") as f: + f.write( + f'"${task.process}":\n' + f' python: {platform.python_version()}\n' + f' pandas: {pandas.__version__}\n' + f' scikit-learn: {sklearn.__version__}\n' + f' umap-learn: {umap_version}\n' + f' matplotlib: {matplotlib.__version__}\n' + ) From d48a224c86207fb8b5ccc16c04ad981fbc0a4e7c Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Fri, 8 May 2026 14:01:11 +0200 Subject: [PATCH 18/38] style: clean cluster visualization module main --- modules/nf-core/custom/clustervisualiation/main.nf | 13 +------------ modules/nf-core/custom/clustervisualiation/meta.yml | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/modules/nf-core/custom/clustervisualiation/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf index c2682992c436..cd67ff46275b 100644 --- a/modules/nf-core/custom/clustervisualiation/main.nf +++ b/modules/nf-core/custom/clustervisualiation/main.nf @@ -1,15 +1,4 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl = 2 - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PROCESS: CLUSTER_VIZ - Generates PCA, UMAP and t-SNE visualizations colored by cluster - Author: Donald Baku (author) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -process CLUSTER_VIZ { +process CUSTOM_CLUSTERVISUALIATION { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" diff --git a/modules/nf-core/custom/clustervisualiation/meta.yml 
b/modules/nf-core/custom/clustervisualiation/meta.yml index 9828a5c2a6bc..63bcd13f76ec 100644 --- a/modules/nf-core/custom/clustervisualiation/meta.yml +++ b/modules/nf-core/custom/clustervisualiation/meta.yml @@ -1,4 +1,4 @@ -name: "CLUSTER_VIZ" +name: "CUSTOM_CLUSTERVISUALIATION" description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" keywords: - clustering From 2d85309ecc0d0e3ba3dba76defc65f12eae568c6 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Fri, 8 May 2026 16:38:41 +0200 Subject: [PATCH 19/38] fix: address reviewer feedback for cluster modules --- .../nf-core/custom/clustermetrics/Dockerfile | 8 ----- modules/nf-core/custom/clustermetrics/main.nf | 5 ++-- .../nf-core/custom/clustermetrics/meta.yml | 21 ++++++-------- .../templates/cluster_metrics.py | 29 +++++++++++++++++++ .../custom/clustervisualiation/Dockerfile | 8 ----- .../custom/clustervisualiation/main.nf | 8 ++--- .../custom/clustervisualiation/meta.yml | 3 -- .../templates/cluster_viz.py | 2 +- .../clustervisualiation/tests/main.nf.test | 4 +-- 9 files changed, 48 insertions(+), 40 deletions(-) delete mode 100644 modules/nf-core/custom/clustermetrics/Dockerfile delete mode 100644 modules/nf-core/custom/clustervisualiation/Dockerfile diff --git a/modules/nf-core/custom/clustermetrics/Dockerfile b/modules/nf-core/custom/clustermetrics/Dockerfile deleted file mode 100644 index 1fd701bd86f8..000000000000 --- a/modules/nf-core/custom/clustermetrics/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM nfcore/base:2.0 - -LABEL authors="dbaku42" \ - description="Docker image containing all requirements for nf-core/custom/clustermetrics" - -COPY environment.yml / -RUN micromamba install -y -n base -f /environment.yml && \ - micromamba clean -a -y diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index 1c8fe203fc9f..2d7a374ba295 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ 
b/modules/nf-core/custom/clustermetrics/main.nf @@ -3,8 +3,9 @@ process CUSTOM_CLUSTERMETRICS { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/69/69a6d33f6bd1a901cad8a6914b6ad11a7db6c35005b4ff8604f20f1baba10fc3/data' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:b7d7028d28dc4084' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c9/c993602a6f49b387b34e84f41fda5a393355850b2dd6ab776f1307a0e7b9d540/data' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:c378d29780adbcbf' }" + input: tuple val(meta), path(features), path(clusters) diff --git a/modules/nf-core/custom/clustermetrics/meta.yml b/modules/nf-core/custom/clustermetrics/meta.yml index a467cc634cba..432d56069427 100644 --- a/modules/nf-core/custom/clustermetrics/meta.yml +++ b/modules/nf-core/custom/clustermetrics/meta.yml @@ -1,4 +1,4 @@ -name: "CLUSTER_METRICS" +name: "CUSTOM_CLUSTERMETRICS" description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) and performs k-sweep analysis" keywords: @@ -19,22 +19,19 @@ tools: input: - - meta: type: map - description: Groovy Map containing sample information + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` - features: type: file - description: TSV file with sample_id and numeric features (e.g. 
PCA - scores) - pattern: "*.tsv" - ontologies: - - edam: http://edamontology.org/format_3475 + description: Feature matrix file + pattern: "*" + ontologies: [] - clusters: type: file - description: CSV/TSV file with sample_id and cluster assignment - pattern: "*_clusters.*" + description: Cluster assignment file + pattern: "*" ontologies: [] - - out_prefix: - type: string - description: Prefix for output files output: metrics: - - meta: diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 13a417ac37fe..742df47b043d 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -282,4 +282,33 @@ def plot_curve(metric, title, ylabel, out_png): if __name__ == "__main__": + import sys + import platform + + prefix = "${task.ext.prefix ?: meta.id}" + + sys.argv = [ + "cluster_metrics.py", + "--features", "$features", + "--clusters", "$clusters", + "--k-min", "2", + "--k-max", "12", + "--out-k-sweep", f"{prefix}_k_sweep.csv", + "--out-selected", f"{prefix}_selected.json", + "--out-prefix", prefix, + ] + main() + + import matplotlib + import pandas + import sklearn + + with open("versions.yml", "w") as f: + f.write( + f'"${task.process}":\n' + f' python: {platform.python_version()}\n' + f' pandas: {pandas.__version__}\n' + f' scikit-learn: {sklearn.__version__}\n' + f' matplotlib: {matplotlib.__version__}\n' + ) diff --git a/modules/nf-core/custom/clustervisualiation/Dockerfile b/modules/nf-core/custom/clustervisualiation/Dockerfile deleted file mode 100644 index 577062b77918..000000000000 --- a/modules/nf-core/custom/clustervisualiation/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM nfcore/base:2.0 - -LABEL authors="dbaku42" \ - description="Docker image containing all requirements for nf-core/custom/clustervisualiation" - -COPY environment.yml / -RUN micromamba install -y -n base 
-f /environment.yml && \ - micromamba clean -a -y diff --git a/modules/nf-core/custom/clustervisualiation/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf index cd67ff46275b..a1d49f08dff3 100644 --- a/modules/nf-core/custom/clustervisualiation/main.nf +++ b/modules/nf-core/custom/clustervisualiation/main.nf @@ -2,12 +2,12 @@ process CUSTOM_CLUSTERVISUALIATION { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/clustervisualiation:dev' : - 'quay.io/nf-core/clustervisualiation:dev' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c0/c00b83d40a02e4ed2833ebf0d38635602231a21764eff0d30ed16885e5c02445/data' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_umap-learn:2c4aaf377be5cd4a' }" + input: tuple val(meta), path(features), path(clusters), path(pca_scores) - val out_prefix output: tuple val(meta), path("*_umap.tsv") , emit: umap diff --git a/modules/nf-core/custom/clustervisualiation/meta.yml b/modules/nf-core/custom/clustervisualiation/meta.yml index 63bcd13f76ec..ebde48a4bbef 100644 --- a/modules/nf-core/custom/clustervisualiation/meta.yml +++ b/modules/nf-core/custom/clustervisualiation/meta.yml @@ -43,9 +43,6 @@ input: pattern: "*_pca_scores.tsv" ontologies: - edam: http://edamontology.org/format_3475 - - out_prefix: - type: string - description: Prefix for output files output: umap: - - meta: diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py index 63f6b0c24a10..df4d3913d758 100644 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ 
b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py @@ -200,7 +200,7 @@ def main() -> None: import sys import platform - prefix = "${task.ext.prefix ?: out_prefix ?: meta.id}" + prefix = "${task.ext.prefix ?: meta.id}" sys.argv = [ "cluster_viz.py", diff --git a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test index f4695eedae1e..43c55045ae1e 100644 --- a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test +++ b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test @@ -1,7 +1,7 @@ nextflow_process { - name "Test Process CLUSTER_VIZ" + name "Test Process CUSTOM_CLUSTERVISUALIATION" script "../main.nf" - process "CLUSTER_VIZ" + process "CUSTOM_CLUSTERVISUALIATION" tag "modules" tag "modules_nfcore" tag "custom" From 387cbebd2b89d9bba0c43996078ac9237cc4e297 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 11 May 2026 14:23:26 +0200 Subject: [PATCH 20/38] fix: address pinin4fjords follow-up review - template escaping, drop PCA orphans, fix versions.yml, rename clustervisualiation -> clustervisualization --- .gitignore | 11 + modules/nf-core/custom/clustermetrics/main.nf | 6 +- .../templates/cluster_metrics.py | 74 +++--- .../custom/clustermetrics/tests/main.nf.test | 4 +- .../clustermetrics/tests/main.nf.test.snap | 38 +-- .../custom/clustervisualiation/main.nf | 26 -- .../templates/cluster_viz.py | 237 ------------------ .../clustervisualiation/tests/main.nf.test | 54 ---- .../tests/main.nf.test.snap | 151 ----------- .../environment.yml | 0 .../custom/clustervisualization/main.nf | 43 ++++ .../meta.yml | 2 +- .../templates/cluster_viz.py | 229 +++++++++++++++++ .../tests/data/test_clusters.csv | 0 .../tests/data/test_features.tsv | 0 .../tests/data/test_pca.eigenvec | 0 .../clustervisualization/tests/main.nf.test | 47 ++++ .../tests/main.nf.test.snap | 111 ++++++++ 18 files changed, 513 insertions(+), 520 deletions(-) delete mode 100644 
modules/nf-core/custom/clustervisualiation/main.nf delete mode 100644 modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py delete mode 100644 modules/nf-core/custom/clustervisualiation/tests/main.nf.test delete mode 100644 modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap rename modules/nf-core/custom/{clustervisualiation => clustervisualization}/environment.yml (100%) create mode 100644 modules/nf-core/custom/clustervisualization/main.nf rename modules/nf-core/custom/{clustervisualiation => clustervisualization}/meta.yml (98%) create mode 100644 modules/nf-core/custom/clustervisualization/templates/cluster_viz.py rename modules/nf-core/custom/{clustervisualiation => clustervisualization}/tests/data/test_clusters.csv (100%) rename modules/nf-core/custom/{clustervisualiation => clustervisualization}/tests/data/test_features.tsv (100%) rename modules/nf-core/custom/{clustervisualiation => clustervisualization}/tests/data/test_pca.eigenvec (100%) create mode 100644 modules/nf-core/custom/clustervisualization/tests/main.nf.test create mode 100644 modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap diff --git a/.gitignore b/.gitignore index df5aafd3cc74..b9460e81b015 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,14 @@ test_output/ tests/data/ work/ .github/CODEOWNERS-tmp +modules/local/ +vcf_data/ +subworkflows/nf-core/snpclustering/modules/ +subworkflows/nf-core/snpclustering/run.log +subworkflows/nf-core/snpclustering/run_test.nf +subworkflows/nf-core/snpclustering/test_local.nf +subworkflows/nf-core/snpclustering/scripts/ +subworkflows/nf-core/snpclustering/Dockerfile +subworkflows/nf-core/snpclustering/tests/ +subworkflows/nf-core/snpclustering/main.nf +modules/nf-core/clustering/ diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index 2d7a374ba295..e41e695f87f3 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ 
b/modules/nf-core/custom/clustermetrics/main.nf @@ -5,7 +5,7 @@ process CUSTOM_CLUSTERMETRICS { container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c9/c993602a6f49b387b34e84f41fda5a393355850b2dd6ab776f1307a0e7b9d540/data' : 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:c378d29780adbcbf' }" - + input: tuple val(meta), path(features), path(clusters) @@ -28,6 +28,10 @@ process CUSTOM_CLUSTERMETRICS { touch ${prefix}_metrics.tsv touch ${prefix}_k_sweep.csv touch ${prefix}_selected.json + touch ${prefix}_elbow.png + touch ${prefix}_silhouette.png + touch ${prefix}_davies_bouldin.png + touch ${prefix}_calinski.png cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 742df47b043d..01236a37ce3f 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -2,11 +2,14 @@ import argparse import json +import platform +import sys from pathlib import Path import matplotlib import numpy as np import pandas as pd +import sklearn from sklearn.cluster import KMeans from sklearn.metrics import ( calinski_harabasz_score, @@ -17,6 +20,18 @@ matplotlib.use("Agg") +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string (nf-core standard).""" + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df.columns = [str(c).lstrip("#") for c in df.columns] @@ -190,15 +205,15 
@@ def main() -> None: alignment_mode = "row_order_fallback" else: raise ValueError( - f"No overlapping sample_id between features and clusters.\n" - f" features IDs (first 5): {sample_ids.head().tolist()}\n" + f"No overlapping sample_id between features and clusters.\\n" + f" features IDs (first 5): {sample_ids.head().tolist()}\\n" f" clusters IDs (first 5): {list(clusters.index[:5])}" ) else: if len(clusters_df) != len(sample_ids): raise ValueError( - "clusters CSV has no usable sample_id column and row counts do not match.\n" - f" n_features={len(sample_ids)}\n" + "clusters CSV has no usable sample_id column and row counts do not match.\\n" + f" n_features={len(sample_ids)}\\n" f" n_clusters={len(clusters_df)}" ) x = x_df.values @@ -278,37 +293,40 @@ def plot_curve(metric, title, ylabel, out_png): ) except Exception as e: - Path("plot_warning.txt").write_text(f"Plotting failed: {e}\n") + Path("plot_warning.txt").write_text("Plotting failed: " + str(e) + "\\n") + + # === VERSIONS.YML (fix review) === + versions = { + "${task.process}": { + "python": platform.python_version(), + "pandas": pd.__version__, + "scikit-learn": sklearn.__version__, + "matplotlib": matplotlib.__version__, + } + } + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) if __name__ == "__main__": - import sys - import platform - prefix = "${task.ext.prefix ?: meta.id}" sys.argv = [ "cluster_metrics.py", - "--features", "$features", - "--clusters", "$clusters", - "--k-min", "2", - "--k-max", "12", - "--out-k-sweep", f"{prefix}_k_sweep.csv", - "--out-selected", f"{prefix}_selected.json", - "--out-prefix", prefix, + "--features", + "$features", + "--clusters", + "$clusters", + "--k-min", + "2", + "--k-max", + "12", + "--out-k-sweep", + f"{prefix}_k_sweep.csv", + "--out-selected", + f"{prefix}_selected.json", + "--out-prefix", + prefix, ] main() - - import matplotlib - import pandas - import sklearn - - with open("versions.yml", "w") as f: - f.write( - 
f'"${task.process}":\n' - f' python: {platform.python_version()}\n' - f' pandas: {pandas.__version__}\n' - f' scikit-learn: {sklearn.__version__}\n' - f' matplotlib: {matplotlib.__version__}\n' - ) diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test b/modules/nf-core/custom/clustermetrics/tests/main.nf.test index 351d1c00df79..1d6613fa3709 100644 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test @@ -1,7 +1,7 @@ nextflow_process { name "Test Process CLUSTER_METRICS" script "../main.nf" - process "CLUSTER_METRICS" + process "CUSTOM_CLUSTERMETRICS" tag "modules" tag "modules_nfcore" tag "custom" @@ -13,7 +13,6 @@ nextflow_process { process { """ input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] - input[1] = 'test' """ } } @@ -34,7 +33,6 @@ nextflow_process { process { """ input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] - input[1] = 'test' """ } } diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap index 1e28bf786221..15deaf982452 100644 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap @@ -26,10 +26,10 @@ ] ], [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,236501fe75ac914d4de40a2c42dbec6b" ] ], - "timestamp": "2026-05-06T17:29:16.510271878", + "timestamp": "2026-05-11T13:08:00.102276222", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -43,7 +43,7 @@ { "id": "test" }, - 
"test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "1": [ @@ -51,7 +51,7 @@ { "id": "test" }, - "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" + "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "2": [ @@ -59,7 +59,7 @@ { "id": "test" }, - "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "3": [ @@ -68,22 +68,22 @@ "id": "test" }, [ - "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", - "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", - "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", - "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], "4": [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" ], "k_sweep": [ [ { "id": "test" }, - "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" + "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "metrics": [ @@ -91,7 +91,7 @@ { "id": "test" }, - "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" + "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "plots": [ @@ -100,10 +100,10 @@ "id": "test" }, [ - "test_calinski.png:md5,a5db5a4340f5a203f8cfb1e27617eab1", - "test_davies_bouldin.png:md5,b083077c690cc454c7094d6d0c72fdb9", - "test_elbow.png:md5,e90e14974deb0776c9b784c1de8b8ef0", - "test_silhouette.png:md5,b7f103711912855f97d8588e1be2d659" + "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -112,15 +112,15 
@@ { "id": "test" }, - "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" + "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "versions": [ - "versions.yml:md5,00e23b98be698c459e9c94079b0164e0" + "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" ] } ], - "timestamp": "2026-05-06T17:29:21.180634202", + "timestamp": "2026-05-11T13:05:52.932850421", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/clustervisualiation/main.nf b/modules/nf-core/custom/clustervisualiation/main.nf deleted file mode 100644 index a1d49f08dff3..000000000000 --- a/modules/nf-core/custom/clustervisualiation/main.nf +++ /dev/null @@ -1,26 +0,0 @@ -process CUSTOM_CLUSTERVISUALIATION { - tag "$meta.id" - label 'process_medium' - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c0/c00b83d40a02e4ed2833ebf0d38635602231a21764eff0d30ed16885e5c02445/data' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_umap-learn:2c4aaf377be5cd4a' }" - - input: - tuple val(meta), path(features), path(clusters), path(pca_scores) - - output: - tuple val(meta), path("*_umap.tsv") , emit: umap - tuple val(meta), path("*_tsne.tsv") , emit: tsne - tuple val(meta), path("*_umap.png") , emit: umap_png - tuple val(meta), path("*_tsne.png") , emit: tsne_png - tuple val(meta), path("*_pca.png") , emit: pca_png - path "versions.yml" , emit: versions, topic: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'cluster_viz.py' - -} diff --git a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py deleted file mode 100644 index df4d3913d758..000000000000 --- a/modules/nf-core/custom/clustervisualiation/templates/cluster_viz.py +++ /dev/null @@ -1,237 +0,0 
@@ -#!/usr/bin/env python3 - -"""Cluster visualizations. - -Produces three 2D plots, all colored by cluster label: - - PCA (first two columns from pca_scores) - - UMAP (computed on the feature matrix used for clustering) - - t-SNE (computed on the feature matrix used for clustering) - -Also writes UMAP and t-SNE coordinates to TSV. -""" - -import argparse - -import numpy as np -import pandas as pd -from sklearn.manifold import TSNE - - -def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: - """ - Handles the header formats that FlashPCA/PLINK2 produces: - - '#IID' (PLINK2 eigenvec: leading hash on first column) - - 'IID' (FlashPCA / older PLINK) - - 'FID', 'IID' (two-column prefix) - - 'sample_id' (already normalised) - """ - # Strip leading '#' (PLINK2 eigenvec writes '#IID' as the first column) - df = df.rename(columns=lambda c: c.lstrip("#")) - - cols_upper = {c.upper(): c for c in df.columns} - - # Remove duplicate header row (IID value == "FID" or "IID") - if "IID" in cols_upper: - iid_col = cols_upper["IID"] - dup_mask = df[iid_col].str.upper().isin({"FID", "IID"}) - if dup_mask.any(): - df = df[~dup_mask].copy().reset_index(drop=True) - - cols_upper = {c.upper(): c for c in df.columns} - - if "SAMPLE_ID" in cols_upper: - return df - - if "IID" in cols_upper: - iid_col = cols_upper["IID"] - iid_numeric = pd.to_numeric(df[iid_col], errors="coerce").notna().all() - if iid_numeric: - df = df.drop(columns=[iid_col]) - df = df.rename(columns={df.columns[0]: "sample_id"}) - else: - df = df.rename(columns={iid_col: "sample_id"}) - - fid_cols = [c for c in df.columns if c.upper() == "FID"] - if fid_cols: - df = df.drop(columns=fid_cols) - return df - - raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). 
Found: {list(df.columns)}") - - -def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: - df = pd.read_csv(path, sep=r"\s+", engine="python", dtype=str) - df = _normalise_id_column(df) - sample_ids = df["sample_id"].astype(str) - x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce").fillna(0.0) - return x, sample_ids - - -def load_clusters(path: str) -> pd.Series: - df = pd.read_csv(path) - if "sample_id" not in df.columns or "cluster" not in df.columns: - raise ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") - return df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) - - -def safe_perplexity(n_samples: int, requested: float) -> float: - if n_samples <= 3: - return 1.0 - upper = (n_samples - 1) / 3.0 - return float(max(2.0, min(requested, upper))) - - -def compute_umap(x: np.ndarray, n_neighbors: int, min_dist: float) -> np.ndarray: - import umap - return umap.UMAP( - n_components=2, - n_neighbors=n_neighbors, - min_dist=min_dist, - random_state=42, - ).fit_transform(x) - -def compute_tsne(x: np.ndarray, perplexity: float, max_iter: int) -> np.ndarray: - return TSNE( - n_components=2, - perplexity=perplexity, - init="pca", - random_state=42, - max_iter=max_iter, - learning_rate="auto", - ).fit_transform(x) - - -def plot_scatter(df, x, y, out_png, title, xlabel=None, ylabel=None): - import matplotlib.pyplot as plt - from matplotlib.lines import Line2D - - plt.figure(figsize=(7, 5)) - labels = df["cluster"].astype(int).values - uniq = np.unique(labels) - sc = plt.scatter(df[x], df[y], c=labels, cmap="Paired", s=24, linewidths=0.4, alpha=0.85) - plt.title(title) - plt.xlabel(xlabel or x) - plt.ylabel(ylabel or y) - plt.grid(True, alpha=0.5) - handles = [ - Line2D( - [0], - [0], - marker="o", - linestyle="", - markersize=7, - markerfacecolor=sc.cmap(sc.norm(k)), - markeredgecolor="none", - label=f"Cluster {k}", - ) - for k in uniq - ] - plt.legend( - handles=handles, - 
title="Clusters", - loc="center left", - bbox_to_anchor=(1.02, 0.5), - borderaxespad=0.0, - frameon=True, - ) - plt.tight_layout() - plt.savefig(out_png, dpi=200, bbox_inches="tight") - plt.close() - - -def main() -> None: - ap = argparse.ArgumentParser(description="PCA + UMAP + t-SNE plots colored by cluster") - ap.add_argument("--features", required=True) - ap.add_argument("--clusters", required=True) - ap.add_argument("--pca-scores", required=True) - ap.add_argument("--tsne-perplexity", type=float, default=30.0) - ap.add_argument("--tsne-iter", type=int, default=1000) - ap.add_argument("--umap-neighbors", type=int, default=15) - ap.add_argument("--umap-min-dist", type=float, default=0.1) - ap.add_argument("--out-umap-tsv", required=True) - ap.add_argument("--out-tsne-tsv", required=True) - ap.add_argument("--out-umap-png", required=True) - ap.add_argument("--out-tsne-png", required=True) - ap.add_argument("--out-pca-png", required=True) - args = ap.parse_args() - - x_df, sample_ids = load_features(args.features) - clusters = load_clusters(args.clusters) - - common = sample_ids[sample_ids.isin(clusters.index)] - if len(common) == 0: - raise ValueError( - f"No overlapping sample_id between features and clusters.\n" - f" features IDs (first 5): {sample_ids.head().tolist()}\n" - f" clusters IDs (first 5): {list(clusters.index[:5])}" - ) - - x = x_df.loc[common.index].values - y = clusters.loc[common.values].values - - umap_coords = compute_umap(x, args.umap_neighbors, args.umap_min_dist) - umap_df = pd.DataFrame( - { - "sample_id": common.values, - "x": umap_coords[:, 0], - "y": umap_coords[:, 1], - "cluster": y, - } - ) - umap_df.to_csv(args.out_umap_tsv, sep="\t", index=False) - plot_scatter(umap_df, "x", "y", args.out_umap_png, "UMAP embedding") - - perp = safe_perplexity(len(common), args.tsne_perplexity) - tsne_coords = compute_tsne(x, perp, args.tsne_iter) - tsne_df = pd.DataFrame( - { - "sample_id": common.values, - "x": tsne_coords[:, 0], - "y": 
tsne_coords[:, 1], - "cluster": y, - } - ) - tsne_df.to_csv(args.out_tsne_tsv, sep="\t", index=False) - plot_scatter(tsne_df, "x", "y", args.out_tsne_png, f"t-SNE (perplexity={perp:.1f})") - - - -if __name__ == "__main__": - import sys - import platform - - prefix = "${task.ext.prefix ?: meta.id}" - - sys.argv = [ - "cluster_viz.py", - "--features", "$features", - "--clusters", "$clusters", - "--pca-scores", "$pca_scores", - "--out-umap-tsv", f"{prefix}_umap.tsv", - "--out-tsne-tsv", f"{prefix}_tsne.tsv", - "--out-umap-png", f"{prefix}_umap.png", - "--out-tsne-png", f"{prefix}_tsne.png", - "--out-pca-png", f"{prefix}_pca.png", - ] - - main() - - import matplotlib - import pandas - import sklearn - - try: - import umap - umap_version = umap.__version__ - except Exception: - umap_version = "N/A" - - with open("versions.yml", "w") as f: - f.write( - f'"${task.process}":\n' - f' python: {platform.python_version()}\n' - f' pandas: {pandas.__version__}\n' - f' scikit-learn: {sklearn.__version__}\n' - f' umap-learn: {umap_version}\n' - f' matplotlib: {matplotlib.__version__}\n' - ) diff --git a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test deleted file mode 100644 index 43c55045ae1e..000000000000 --- a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test +++ /dev/null @@ -1,54 +0,0 @@ -nextflow_process { - name "Test Process CUSTOM_CLUSTERVISUALIATION" - script "../main.nf" - process "CUSTOM_CLUSTERVISUALIATION" - tag "modules" - tag "modules_nfcore" - tag "custom" - tag "custom/clustervisualiation" - tag "clustervisualiation" - - test("clustervisualiation - features clusters pca") { - when { - process { - """ - input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv", checkIfExists: true), - 
file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec", checkIfExists: true) ] - input[1] = 'test' - """ - } - } - then { - assert process.success - assert snapshot( - process.out.umap, - process.out.tsne, - process.out.umap_png, - process.out.tsne_png, - process.out.pca_png, - process.out.versions - ).match() - } - } - - test("clustervisualiation - features clusters pca - stub") { - options "-stub" - when { - process { - """ - input[0] = [ [id:'test'], - file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec", checkIfExists: true) ] - input[1] = 'test' - """ - } - } - then { - assert process.success - assert snapshot(process.out).match() - } - } -} diff --git a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap deleted file mode 100644 index bc8ca92380e9..000000000000 --- a/modules/nf-core/custom/clustervisualiation/tests/main.nf.test.snap +++ /dev/null @@ -1,151 +0,0 @@ -{ - "clustervisualiation - features clusters pca": { - "content": [ - [ - [ - { - "id": "test" - }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" - ] - ], - [ - [ - { - "id": "test" - }, - "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" - ] - ], - [ - [ - { - "id": "test" - }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" - ] - ], - [ - [ - { - "id": "test" - }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" - ] - ], - [ - [ - { - "id": "test" - }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" - ] - ], - [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" - ] - ], - "timestamp": "2026-05-06T17:29:34.854148226", - "meta": { - "nf-test": "0.9.5", - "nextflow": 
"25.09.0" - } - }, - "clustervisualiation - features clusters pca - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" - ] - ], - "2": [ - [ - { - "id": "test" - }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" - ] - ], - "3": [ - [ - { - "id": "test" - }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" - ] - ], - "4": [ - [ - { - "id": "test" - }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" - ] - ], - "5": [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" - ], - "pca_png": [ - [ - { - "id": "test" - }, - "test_pca.png:md5,a659f260b13e0351dd28cbb5163d6a20" - ] - ], - "tsne": [ - [ - { - "id": "test" - }, - "test_tsne.tsv:md5,fc79b712722096f127cdb80c269380fb" - ] - ], - "tsne_png": [ - [ - { - "id": "test" - }, - "test_tsne.png:md5,aeacfdc6d9e35dd27406e70ae9220715" - ] - ], - "umap": [ - [ - { - "id": "test" - }, - "test_umap.tsv:md5,feda69a2108d84b70c3937a1cb84e661" - ] - ], - "umap_png": [ - [ - { - "id": "test" - }, - "test_umap.png:md5,470fab262187541b0ad52a3138bf3734" - ] - ], - "versions": [ - "versions.yml:md5,786af5c4301c54553001db08a8c9db5b" - ] - } - ], - "timestamp": "2026-05-06T17:29:46.705755643", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - } -} \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualiation/environment.yml b/modules/nf-core/custom/clustervisualization/environment.yml similarity index 100% rename from modules/nf-core/custom/clustervisualiation/environment.yml rename to modules/nf-core/custom/clustervisualization/environment.yml diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf new file mode 100644 index 000000000000..4c4949fb60d6 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ -0,0 +1,43 @@ +process 
CUSTOM_CLUSTERVISUALIZATION { + tag "$meta.id" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c9/c993602a6f49b387b34e84f41fda5a393355850b2dd6ab776f1307a0e7b9d540/data' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_seaborn_umap-learn:c378d29780adbcbf' }" + + input: + tuple val(meta), path(features), path(clusters) + + output: + tuple val(meta), path("*.umap.tsv") , emit: umap_tsv + tuple val(meta), path("*.tsne.tsv") , emit: tsne_tsv + tuple val(meta), path("*.umap.png") , emit: umap_png, optional: true + tuple val(meta), path("*.tsne.png") , emit: tsne_png, optional: true + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'cluster_viz.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.umap.tsv + touch ${prefix}.tsne.tsv + touch ${prefix}.umap.png + touch ${prefix}.tsne.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + pandas: \$(python3 -c "import pandas; print(pandas.__version__)") + matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") + seaborn: \$(python3 -c "import seaborn; print(seaborn.__version__)") + umap-learn: \$(python3 -c "import umap; print(umap.__version__)") + scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustervisualiation/meta.yml b/modules/nf-core/custom/clustervisualization/meta.yml similarity index 98% rename from modules/nf-core/custom/clustervisualiation/meta.yml rename to modules/nf-core/custom/clustervisualization/meta.yml index ebde48a4bbef..a7f02374f268 100644 --- 
a/modules/nf-core/custom/clustervisualiation/meta.yml +++ b/modules/nf-core/custom/clustervisualization/meta.yml @@ -1,4 +1,4 @@ -name: "CUSTOM_CLUSTERVISUALIATION" +name: "CUSTOM_CLUSTERVISUALIZATION" description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" keywords: - clustering diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py new file mode 100644 index 000000000000..c67391521f6c --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 + +import platform +import sys + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import sklearn +from sklearn.manifold import TSNE +from umap import UMAP +import umap as umap_module + +matplotlib.use("Agg") + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string (nf-core standard).""" + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + +def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: + df = df.copy() + df.columns = [str(c).lstrip("#") for c in df.columns] + + cols_upper = {str(c).upper(): c for c in df.columns} + + if "IID" in cols_upper: + iid_col = cols_upper["IID"] + dup_mask = df[iid_col].astype(str).str.upper().isin({"FID", "IID"}) + if dup_mask.any(): + df = df.loc[~dup_mask].copy().reset_index(drop=True) + + cols_upper = {str(c).upper(): c for c in df.columns} + + if "SAMPLE_ID" in cols_upper: + sample_col = cols_upper["SAMPLE_ID"] + if sample_col != "sample_id": + df = df.rename(columns={sample_col: "sample_id"}) + return df + + if "IID" in cols_upper: + iid_col = cols_upper["IID"] + iid_numeric = 
pd.to_numeric(df[iid_col], errors="coerce").notna().all() + + if iid_numeric: + df = df.drop(columns=[iid_col]) + if len(df.columns) == 0: + raise ValueError("Cannot infer sample_id after dropping numeric IID column") + df = df.rename(columns={df.columns[0]: "sample_id"}) + else: + df = df.rename(columns={iid_col: "sample_id"}) + + fid_cols = [c for c in df.columns if str(c).upper() == "FID"] + if fid_cols: + df = df.drop(columns=fid_cols) + + return df + + raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). Found: {list(df.columns)}") + + +def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: + df = pd.read_csv(path, sep="\\t", dtype=str) + df = _normalise_id_column(df) + + if "sample_id" not in df.columns: + raise ValueError("features file must contain a sample_id column after normalization") + + sample_ids = df["sample_id"].astype(str) + x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce") + x = x.fillna(x.mean(numeric_only=True)) + x = x.fillna(0.0) + + return x, sample_ids + + +def load_clusters(path: str) -> tuple[pd.DataFrame, str]: + """Load clusters and return (df, mode). 
Same logic as cluster_metrics.""" + df = pd.read_csv(path, sep=",", dtype=str) + df = df.copy() + df.columns = [str(c).lstrip("#") for c in df.columns] + + cols_upper = {str(c).upper(): c for c in df.columns} + + if "CLUSTER" not in cols_upper: + raise ValueError("clusters CSV must have a 'cluster' column") + + cluster_col = cols_upper["CLUSTER"] + + if "SAMPLE_ID" in cols_upper: + sample_col = cols_upper["SAMPLE_ID"] + out = df[[sample_col, cluster_col]].copy() + out.columns = ["sample_id", "cluster"] + out["sample_id"] = out["sample_id"].astype(str) + out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) + return out, "sample_id" + + try: + norm = _normalise_id_column(df.copy()) + if "sample_id" in norm.columns and "cluster" in norm.columns: + out = norm[["sample_id", "cluster"]].copy() + out["sample_id"] = out["sample_id"].astype(str) + out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) + return out, "sample_id" + except Exception: + pass + + other_cols = [c for c in df.columns if c != cluster_col] + + if len(other_cols) == 1: + candidate = other_cols[0] + candidate_vals = df[candidate].astype(str) + + if not ( + len(candidate_vals) > 0 and float(pd.to_numeric(candidate_vals, errors="coerce").notna().mean()) >= 0.8 + ): + out = pd.DataFrame( + { + "sample_id": candidate_vals, + "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int), + } + ) + return out, "sample_id" + + out = pd.DataFrame({"cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int)}) + return out, "row_order" + + +def plot_embedding(x: np.ndarray, labels: np.ndarray, method: str, prefix: str) -> None: + """Plot UMAP or t-SNE with cluster coloring.""" + if method == "umap": + reducer = UMAP(random_state=42) + embedding = reducer.fit_transform(x) + title = "UMAP" + out_tsv = f"{prefix}.umap.tsv" + out_png = f"{prefix}.umap.png" + else: # tsne + reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, len(x) - 1)) + 
embedding = reducer.fit_transform(x) + title = "t-SNE" + out_tsv = f"{prefix}.tsne.tsv" + out_png = f"{prefix}.tsne.png" + + # Save embedding + emb_df = pd.DataFrame(embedding, columns=["Dim1", "Dim2"]) + emb_df["cluster"] = labels + emb_df.to_csv(out_tsv, sep="\\t", index=False) + + # Plot + plt.figure(figsize=(8, 6)) + palette = sns.color_palette("tab10", n_colors=len(np.unique(labels))) + sns.scatterplot( + x=embedding[:, 0], + y=embedding[:, 1], + hue=labels.astype(str), + palette=palette, + alpha=0.8, + s=60, + edgecolor="k", + linewidth=0.3, + ) + plt.title(f"{title} projection of features colored by cluster") + plt.xlabel(f"{title} 1") + plt.ylabel(f"{title} 2") + plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.tight_layout() + plt.savefig(out_png, dpi=200, bbox_inches="tight") + plt.close() + + +def main() -> None: + features = "$features" + clusters_path = "$clusters" + prefix = "${task.ext.prefix ?: meta.id}" + + x_df, sample_ids = load_features(features) + clusters_df, cluster_mode = load_clusters(clusters_path) + + if cluster_mode == "sample_id": + clusters = clusters_df.set_index("sample_id")["cluster"] + common = sample_ids[sample_ids.isin(clusters.index)] + if len(common) > 0: + x = x_df.loc[common.index].values + labels = clusters.loc[common.values].values + elif len(clusters_df) == len(sample_ids): + x = x_df.values + labels = clusters_df["cluster"].values + else: + raise ValueError("No overlapping sample_id between features and clusters") + else: + if len(clusters_df) != len(sample_ids): + raise ValueError("Row counts do not match and no sample_id column found") + x = x_df.values + labels = clusters_df["cluster"].values + + if len(x) < 2: + raise ValueError("Need at least 2 samples for embedding") + + # Generate both embeddings + plot_embedding(x, labels, "umap", prefix) + plot_embedding(x, labels, "tsne", prefix) + + # versions.yml + versions = { + "${task.process}": { + "python": platform.python_version(), + 
"pandas": pd.__version__, + "matplotlib": matplotlib.__version__, + "seaborn": sns.__version__, + "umap-learn": umap_module.__version__, + "scikit-learn": sklearn.__version__, + } + } + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv b/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv similarity index 100% rename from modules/nf-core/custom/clustervisualiation/tests/data/test_clusters.csv rename to modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv diff --git a/modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv b/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv similarity index 100% rename from modules/nf-core/custom/clustervisualiation/tests/data/test_features.tsv rename to modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv diff --git a/modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec b/modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec similarity index 100% rename from modules/nf-core/custom/clustervisualiation/tests/data/test_pca.eigenvec rename to modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test b/modules/nf-core/custom/clustervisualization/tests/main.nf.test new file mode 100644 index 000000000000..76e03b14c860 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test @@ -0,0 +1,47 @@ +nextflow_process { + name "Test Process CUSTOM_CLUSTERVISUALIZATION" + script "../main.nf" + process "CUSTOM_CLUSTERVISUALIZATION" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustervisualization" + tag "clustervisualization" + + test("clustervisualization - features and clusters") { + when { + process { + """ + input[0] = [ 
[id:'test'], + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) ] + """ + } + } + then { + assert process.success + assert snapshot( + process.out.umap_tsv, + process.out.tsne_tsv, + process.out.versions + ).match() + } + } + + test("clustervisualization - features and clusters - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) ] + """ + } + } + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap new file mode 100644 index 000000000000..610912e901ea --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap @@ -0,0 +1,111 @@ +{ + "clustervisualization - features and clusters - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" + ], + "tsne_png": [ + [ + { + "id": "test" + }, + "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tsne_tsv": [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "umap_png": [ + [ + { + "id": 
"test" + }, + "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "umap_tsv": [ + [ + { + "id": "test" + }, + "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" + ] + } + ], + "timestamp": "2026-05-11T14:15:13.987219333", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + }, + "clustervisualization - features and clusters": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.umap.tsv:md5,2cba3fa6ba2d3ce80ad884b4210403eb" + ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,738a97587fa8c72614d2655eddbd2f7a" + ] + ], + [ + "versions.yml:md5,43b533ced227b510ca833d01881efc8e" + ] + ], + "timestamp": "2026-05-11T14:15:07.547048716", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + } +} \ No newline at end of file From 4ed380b14761a7dd2f516ba5ac7e4830b9331ec5 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 11 May 2026 16:36:40 +0200 Subject: [PATCH 21/38] feat(custom/clustervisualization): add UMAP and t-SNE cluster visualization module --- .../clustervisualization/environment.yml | 1 + .../custom/clustervisualization/main.nf | 4 +- .../custom/clustervisualization/meta.yml | 71 ++++++++----------- 3 files changed, 31 insertions(+), 45 deletions(-) diff --git a/modules/nf-core/custom/clustervisualization/environment.yml b/modules/nf-core/custom/clustervisualization/environment.yml index 803fb67fb108..b68e2333b2f2 100644 --- a/modules/nf-core/custom/clustervisualization/environment.yml +++ b/modules/nf-core/custom/clustervisualization/environment.yml @@ -8,4 +8,5 @@ dependencies: - pandas=2.2.* - python=3.12 - scikit-learn=1.5.* + - seaborn=0.13.* - umap-learn=0.5.* diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf index 4c4949fb60d6..9120f2e18f2f 100644 --- a/modules/nf-core/custom/clustervisualization/main.nf +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ 
-2,9 +2,7 @@ process CUSTOM_CLUSTERVISUALIZATION { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c9/c993602a6f49b387b34e84f41fda5a393355850b2dd6ab776f1307a0e7b9d540/data' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_seaborn_umap-learn:c378d29780adbcbf' }" + container "community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:9579c043ac096a34" input: tuple val(meta), path(features), path(clusters) diff --git a/modules/nf-core/custom/clustervisualization/meta.yml b/modules/nf-core/custom/clustervisualization/meta.yml index a7f02374f268..0d90ab6fa53d 100644 --- a/modules/nf-core/custom/clustervisualization/meta.yml +++ b/modules/nf-core/custom/clustervisualization/meta.yml @@ -1,5 +1,5 @@ name: "CUSTOM_CLUSTERVISUALIZATION" -description: "Generates PCA, UMAP and t-SNE visualizations colored by cluster" +description: "Generates UMAP and t-SNE visualizations colored by cluster" keywords: - clustering - visualization @@ -8,14 +8,14 @@ keywords: - tsne - dimension-reduction tools: - - "scikit-learn": + - scikit-learn: description: "Machine learning library for dimension reduction (PCA, t-SNE)" homepage: "https://scikit-learn.org/" documentation: "https://scikit-learn.org/stable/modules/clustering.html" licence: - "BSD-3-Clause" identifier: "" - - "umap-learn": + - umap-learn: description: "Uniform Manifold Approximation and Projection for dimension reduction" homepage: "https://umap-learn.readthedocs.io/" documentation: "https://umap-learn.readthedocs.io/en/latest/" @@ -25,76 +25,63 @@ tools: input: - - meta: type: map - description: Groovy Map containing sample information + description: "Groovy Map containing sample information" - features: type: file - description: TSV file with sample_id and numeric 
features + description: "TSV file with sample_id and numeric features" pattern: "*.tsv" ontologies: - - edam: http://edamontology.org/format_3475 + - edam: "http://edamontology.org/format_3475" - clusters: type: file - description: CSV/TSV file with sample_id and cluster assignment + description: "CSV/TSV file with sample_id and cluster assignment" pattern: "*_clusters.*" ontologies: [] - - pca_scores: - type: file - description: TSV file with PCA scores from previous step - pattern: "*_pca_scores.tsv" - ontologies: - - edam: http://edamontology.org/format_3475 output: - umap: + umap_tsv: - - meta: type: map - description: Groovy Map containing sample information - - "*_umap.tsv": + description: "Groovy Map containing sample information" + - "*.umap.tsv": type: file - description: UMAP coordinates TSV file - pattern: "*_umap.tsv" + description: "UMAP coordinates per sample" + pattern: "*.umap.tsv" ontologies: + - edam: "http://edamontology.org/operation_2432" - edam: http://edamontology.org/format_3475 - tsne: + tsne_tsv: - - meta: type: map - description: Groovy Map containing sample information - - "*_tsne.tsv": + description: "Groovy Map containing sample information" + - "*.tsne.tsv": type: file - description: t-SNE coordinates TSV file - pattern: "*_tsne.tsv" + description: "t-SNE coordinates per sample" + pattern: "*.tsne.tsv" ontologies: + - edam: "http://edamontology.org/operation_2432" - edam: http://edamontology.org/format_3475 umap_png: - - meta: type: map - description: Groovy Map containing sample information - - "*_umap.png": + description: "Groovy Map containing sample information" + - "*.umap.png": type: file - description: UMAP visualization plot - pattern: "*_umap.png" + description: "UMAP visualization coloured by cluster" + pattern: "*.umap.png" ontologies: [] tsne_png: - - meta: type: map - description: Groovy Map containing sample information - - "*_tsne.png": + description: "Groovy Map containing sample information" + - "*.tsne.png": type: 
file - description: t-SNE visualization plot - pattern: "*_tsne.png" - ontologies: [] - pca_png: - - - meta: - type: map - description: Groovy Map containing sample information - - "*_pca.png": - type: file - description: PCA visualization plot - pattern: "*_pca.png" + description: "t-SNE visualization coloured by cluster" + pattern: "*.tsne.png" ontologies: [] versions: - - "versions.yml": + - versions.yml: type: file - description: File containing software versions + description: "Software versions used in the module" pattern: "versions.yml" ontologies: - edam: http://edamontology.org/format_3750 From 4fe91e77088c215e325671c6c66991f9404ab512 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 11 May 2026 16:56:11 +0200 Subject: [PATCH 22/38] fix: apply ruff formatting to cluster_viz.py template --- .../custom/clustervisualization/templates/cluster_viz.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py index c67391521f6c..f06f76271a30 100644 --- a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import platform -import sys import matplotlib import matplotlib.pyplot as plt @@ -9,9 +8,9 @@ import pandas as pd import seaborn as sns import sklearn +import umap as umap_module from sklearn.manifold import TSNE from umap import UMAP -import umap as umap_module matplotlib.use("Agg") From 09802089e5814fcbb7cb93caebacc8397c4b7625 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Mon, 11 May 2026 20:09:26 +0200 Subject: [PATCH 23/38] fix: align clustermetrics and clustervisualization envs and containers --- modules/nf-core/custom/clustermetrics/environment.yml | 11 +++++++---- modules/nf-core/custom/clustermetrics/main.nf | 5 ++--- .../custom/clustervisualization/environment.yml | 2 +- 
modules/nf-core/custom/clustervisualization/main.nf | 5 +++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/environment.yml b/modules/nf-core/custom/clustermetrics/environment.yml index 616821c92ff9..f4cde46c06ab 100644 --- a/modules/nf-core/custom/clustermetrics/environment.yml +++ b/modules/nf-core/custom/clustermetrics/environment.yml @@ -4,7 +4,10 @@ channels: - conda-forge - bioconda dependencies: - - matplotlib=3.9.* - - pandas=2.2.* - - python=3.12 - - scikit-learn=1.5.* + - conda-forge::matplotlib=3.9.4 + - conda-forge::numpy=2.4.2 + - conda-forge::pandas=2.3.2 + - conda-forge::python=3.12.12 + - conda-forge::scikit-learn=1.8.0 + - conda-forge::seaborn=0.13.2 + - conda-forge::umap-learn=0.5.12 diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index e41e695f87f3..fc9678950dfa 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -3,9 +3,8 @@ process CUSTOM_CLUSTERMETRICS { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c9/c993602a6f49b387b34e84f41fda5a393355850b2dd6ab776f1307a0e7b9d540/data' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn:c378d29780adbcbf' }" - + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92185ca35119bf65d5f45eb57c71dc489edcd209056c32162fc6045fceda6dd6/data' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" input: tuple val(meta), path(features), path(clusters) diff --git a/modules/nf-core/custom/clustervisualization/environment.yml b/modules/nf-core/custom/clustervisualization/environment.yml index b68e2333b2f2..c2cf95a372d2 100644 --- a/modules/nf-core/custom/clustervisualization/environment.yml +++ b/modules/nf-core/custom/clustervisualization/environment.yml @@ -7,6 +7,6 @@ dependencies: - matplotlib=3.9.* - pandas=2.2.* - python=3.12 - - scikit-learn=1.5.* + - scikit-learn=1.6.* - seaborn=0.13.* - umap-learn=0.5.* diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf index 9120f2e18f2f..ed8ab229bf6e 100644 --- a/modules/nf-core/custom/clustervisualization/main.nf +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ -2,8 +2,9 @@ process CUSTOM_CLUSTERVISUALIZATION { tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:9579c043ac096a34" - + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92185ca35119bf65d5f45eb57c71dc489edcd209056c32162fc6045fceda6dd6/data' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" input: tuple val(meta), path(features), path(clusters) From f240623c4b597e7efc3b9d7d8fe4410c217bf0bb Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Tue, 12 May 2026 13:19:57 +0200 Subject: [PATCH 24/38] fix: use docker:// prefix for singularity container to enable OCI conversion --- modules/nf-core/custom/clustermetrics/main.nf | 4 ++-- modules/nf-core/custom/clustervisualization/main.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index fc9678950dfa..67802950c6c2 100644 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -3,8 +3,8 @@ process CUSTOM_CLUSTERMETRICS { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92185ca35119bf65d5f45eb57c71dc489edcd209056c32162fc6045fceda6dd6/data' : - 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" + 'docker://community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" input: tuple val(meta), path(features), path(clusters) diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf index ed8ab229bf6e..a96b0e342c6b 100644 --- a/modules/nf-core/custom/clustervisualization/main.nf +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ -3,8 +3,8 @@ process CUSTOM_CLUSTERVISUALIZATION { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92185ca35119bf65d5f45eb57c71dc489edcd209056c32162fc6045fceda6dd6/data' : - 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" + 'docker://community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" input: tuple val(meta), path(features), path(clusters) From c46c26cac99b2f056b1bba84f2fb4e0db3ece604 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Tue, 12 May 2026 14:39:54 +0200 Subject: [PATCH 25/38] fix(custom/clustervisualization): set NUMBA_CACHE_DIR and MPLCONFIGDIR to fix numba caching in Singularity --- .../custom/clustervisualization/templates/cluster_viz.py | 5 ++++- .../nf-core/custom/clustervisualization/tests/main.nf.test | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py index f06f76271a30..e28d3d5262f5 100644 --- a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import platform - +import os import matplotlib import matplotlib.pyplot as plt import numpy as np @@ -12,6 +12,9 @@ from sklearn.manifold import TSNE from umap import UMAP +# Fix numba + matplotlib in read-only Singularity container +os.environ['NUMBA_CACHE_DIR'] = '/tmp' +os.environ['MPLCONFIGDIR'] = '/tmp' matplotlib.use("Agg") diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test b/modules/nf-core/custom/clustervisualization/tests/main.nf.test index 76e03b14c860..bda59809d0b5 100644 --- a/modules/nf-core/custom/clustervisualization/tests/main.nf.test +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test @@ -10,6 +10,9 @@ 
nextflow_process { test("clustervisualization - features and clusters") { when { + params { + nf_test = true + } process { """ input[0] = [ [id:'test'], From ad39971648d380fa5d29149b000180041b36e7d4 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Tue, 12 May 2026 16:09:31 +0200 Subject: [PATCH 26/38] fix(custom/clustervisualization): move NUMBA_CACHE_DIR fix before any imports, and KMean n_init stable --- .../custom/clustermetrics/templates/cluster_metrics.py | 2 +- .../custom/clustervisualization/templates/cluster_viz.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 01236a37ce3f..5be8bac0bfc2 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -236,7 +236,7 @@ def main() -> None: rows = [] max_k = min(int(args.k_max), len(x)) for k in range(int(args.k_min), max_k + 1): - model = KMeans(n_clusters=k, n_init="auto", random_state=42) + model = KMeans(n_clusters=k, n_init=10, random_state=42) y = model.fit_predict(x) sil = ch = db = None diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py index e28d3d5262f5..82e0e527b4f5 100644 --- a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -1,7 +1,11 @@ #!/usr/bin/env python3 -import platform import os +# Fix numba + matplotlib in read-only Singularity container +os.environ['NUMBA_CACHE_DIR'] = '/tmp' +os.environ['MPLCONFIGDIR'] = '/tmp' + +import platform import matplotlib import matplotlib.pyplot as plt import numpy as np @@ -12,9 +16,6 @@ from sklearn.manifold import TSNE from umap import UMAP -# Fix numba + matplotlib in read-only Singularity container 
-os.environ['NUMBA_CACHE_DIR'] = '/tmp' -os.environ['MPLCONFIGDIR'] = '/tmp' matplotlib.use("Agg") From c8fec1515d031310a74d63bd9fb314cc2cbe2813 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Tue, 12 May 2026 16:34:20 +0200 Subject: [PATCH 27/38] Apply suggestion from @pinin4fjords Co-authored-by: Jonathan Manning --- .../templates/cluster_metrics.py | 59 ++----------------- 1 file changed, 5 insertions(+), 54 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 5be8bac0bfc2..6feb281e2cca 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -88,60 +88,11 @@ def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: return x, sample_ids -def _looks_mostly_numeric(s: pd.Series) -> bool: - if len(s) == 0: - return False - parsed = pd.to_numeric(s.astype(str), errors="coerce") - return float(parsed.notna().mean()) >= 0.8 - - -def load_clusters(path: str) -> tuple[pd.DataFrame, str]: - df = pd.read_csv(path, sep=",", dtype=str) - df = df.copy() - df.columns = [str(c).lstrip("#") for c in df.columns] - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "CLUSTER" not in cols_upper: - raise ValueError("clusters CSV must have a 'cluster' column") - - cluster_col = cols_upper["CLUSTER"] - - if "SAMPLE_ID" in cols_upper: - sample_col = cols_upper["SAMPLE_ID"] - out = df[[sample_col, cluster_col]].copy() - out.columns = ["sample_id", "cluster"] - out["sample_id"] = out["sample_id"].astype(str) - out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) - return out, "sample_id" - - try: - norm = _normalise_id_column(df.copy()) - if "sample_id" in norm.columns and "cluster" in norm.columns: - out = norm[["sample_id", "cluster"]].copy() - out["sample_id"] = out["sample_id"].astype(str) 
- out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) - return out, "sample_id" - except Exception: - pass - - other_cols = [c for c in df.columns if c != cluster_col] - - if len(other_cols) == 1: - candidate = other_cols[0] - candidate_vals = df[candidate].astype(str) - - if not _looks_mostly_numeric(candidate_vals): - out = pd.DataFrame( - { - "sample_id": candidate_vals, - "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int), - } - ) - return out, "sample_id" - - out = pd.DataFrame({"cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int)}) - return out, "row_order" +def load_clusters(path: str) -> pd.Series: + df = pd.read_csv(path) + if "sample_id" not in df.columns or "cluster" not in df.columns: + raise ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") + return df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) def safe_cluster_metrics(x: np.ndarray, labels: np.ndarray) -> dict: From 36344663e2a9e555c8a85aea6945f77de8792c87 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Tue, 12 May 2026 16:34:49 +0200 Subject: [PATCH 28/38] Apply suggestion from @pinin4fjords Co-authored-by: Jonathan Manning --- .../nf-core/custom/clustermetrics/templates/cluster_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 6feb281e2cca..f2e0794a58eb 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -182,7 +182,7 @@ def main() -> None: selected["alignment_mode"] = alignment_mode metrics_tsv = f"{args.out_prefix}_metrics.tsv" - pd.DataFrame([selected]).to_csv(metrics_tsv, sep="\t", index=False) + pd.DataFrame([selected]).to_csv(metrics_tsv, sep="\\t", 
index=False) rows = [] max_k = min(int(args.k_max), len(x)) From 6a7ac451916b9f31dbf374dca16f5bf819c8aeac Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Tue, 12 May 2026 17:19:30 +0200 Subject: [PATCH 29/38] Prek and script fix --- .../custom/clustervisualization/templates/cluster_viz.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py index 82e0e527b4f5..c571d16ac884 100644 --- a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 import os + # Fix numba + matplotlib in read-only Singularity container -os.environ['NUMBA_CACHE_DIR'] = '/tmp' -os.environ['MPLCONFIGDIR'] = '/tmp' +os.environ["NUMBA_CACHE_DIR"] = "/tmp" +os.environ["MPLCONFIGDIR"] = "/tmp" import platform + import matplotlib import matplotlib.pyplot as plt import numpy as np From cddb5a8c9b5045c284c3784d37574c6b8a61a112 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 14:34:38 +0200 Subject: [PATCH 30/38] Fixed pandas series problem in cluster_metrics.py --- .../clustermetrics/templates/cluster_metrics.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index f2e0794a58eb..3a2dacd5c518 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -88,11 +88,19 @@ def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: return x, sample_ids -def load_clusters(path: str) -> pd.Series: +def load_clusters(path: str) -> tuple[pd.Series, str]: df = pd.read_csv(path) - if "sample_id" not in df.columns or "cluster" not in df.columns: - raise 
ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") - return df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) + if "sample_id" in df.columns and "cluster" in df.columns: + series = df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) + return series, "sample_id" + elif "cluster" in df.columns: + series = df["cluster"].astype(int).reset_index(drop=True) + return series, "row_order" + else: + raise ValueError( + f"clusters file must have a 'cluster' column (and optionally 'sample_id'). " + f"Found: {list(df.columns)}" + ) def safe_cluster_metrics(x: np.ndarray, labels: np.ndarray) -> dict: From 4a72291849ec3b90f895c1fcc599ffe39c1dac07 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 14:53:04 +0200 Subject: [PATCH 31/38] fix: escape \n in f-strings for Groovy template compatibility --- .../templates/cluster_metrics.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index 3a2dacd5c518..fad09dab40b1 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -74,7 +74,7 @@ def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: - df = pd.read_csv(path, sep="\t", dtype=str) + df = pd.read_csv(path, sep="\\t", dtype=str) df = _normalise_id_column(df) if "sample_id" not in df.columns: @@ -146,37 +146,36 @@ def main() -> None: args = ap.parse_args() x_df, sample_ids = load_features(args.features) - clusters_df, cluster_mode = load_clusters(args.clusters) + clusters_s, cluster_mode = load_clusters(args.clusters) if cluster_mode == "sample_id": - clusters = clusters_df.set_index("sample_id")["cluster"] - common = 
sample_ids[sample_ids.isin(clusters.index)] + common = sample_ids[sample_ids.isin(clusters_s.index)] if len(common) > 0: x = x_df.loc[common.index].values - labels = clusters.loc[common.values].values + labels = clusters_s.loc[common.values].values aligned_ids = common.astype(str).tolist() alignment_mode = "sample_id" - elif len(clusters_df) == len(sample_ids): + elif len(clusters_s) == len(sample_ids): x = x_df.values - labels = clusters_df["cluster"].values + labels = clusters_s.values aligned_ids = sample_ids.astype(str).tolist() alignment_mode = "row_order_fallback" else: raise ValueError( f"No overlapping sample_id between features and clusters.\\n" f" features IDs (first 5): {sample_ids.head().tolist()}\\n" - f" clusters IDs (first 5): {list(clusters.index[:5])}" + f" clusters IDs (first 5): {list(clusters_s.index[:5])}" ) else: - if len(clusters_df) != len(sample_ids): + if len(clusters_s) != len(sample_ids): raise ValueError( - "clusters CSV has no usable sample_id column and row counts do not match.\\n" + "clusters CSV has no usable sample_id column and row counts do not match.\n" f" n_features={len(sample_ids)}\\n" - f" n_clusters={len(clusters_df)}" + f" n_clusters={len(clusters_s)}" ) x = x_df.values - labels = clusters_df["cluster"].values + labels = clusters_s.values aligned_ids = sample_ids.astype(str).tolist() alignment_mode = "row_order" From 2bf8afac71bec0d7d86ad6644307137a87659bb9 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 15:49:01 +0200 Subject: [PATCH 32/38] Format CUSTOM_CLUSTERMETRICS template with ruff --- modules/nf-core/custom/clustermetrics/main.nf | 4 ++-- .../custom/clustermetrics/templates/cluster_metrics.py | 7 +++---- modules/nf-core/custom/clustervisualization/main.nf | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf index 67802950c6c2..e7fe70ca970e 100644 --- 
a/modules/nf-core/custom/clustermetrics/main.nf +++ b/modules/nf-core/custom/clustermetrics/main.nf @@ -3,8 +3,8 @@ process CUSTOM_CLUSTERMETRICS { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'docker://community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' : - 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" + 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" input: tuple val(meta), path(features), path(clusters) diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py index fad09dab40b1..f8fb15f2e776 100644 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -98,8 +98,7 @@ def load_clusters(path: str) -> tuple[pd.Series, str]: return series, "row_order" else: raise ValueError( - f"clusters file must have a 'cluster' column (and optionally 'sample_id'). " - f"Found: {list(df.columns)}" + f"clusters file must have a 'cluster' column (and optionally 'sample_id'). Found: {list(df.columns)}" ) @@ -170,7 +169,7 @@ def main() -> None: else: if len(clusters_s) != len(sample_ids): raise ValueError( - "clusters CSV has no usable sample_id column and row counts do not match.\n" + "clusters CSV has no usable sample_id column and row counts do not match.\\n" f" n_features={len(sample_ids)}\\n" f" n_clusters={len(clusters_s)}" ) @@ -267,7 +266,7 @@ def plot_curve(metric, title, ylabel, out_png): if __name__ == "__main__": - prefix = "${task.ext.prefix ?: meta.id}" + prefix = "${task.ext.prefix ? 
task.ext.prefix : meta.id}" sys.argv = [ "cluster_metrics.py", diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf index a96b0e342c6b..7d9156a0cb14 100644 --- a/modules/nf-core/custom/clustervisualization/main.nf +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ -3,8 +3,8 @@ process CUSTOM_CLUSTERVISUALIZATION { label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'docker://community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' : - 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:6b81abc92579656a' }" + 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" input: tuple val(meta), path(features), path(clusters) From fd15709d9e3f33d246b21aca6d778dea1f02f1dd Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 16:37:10 +0200 Subject: [PATCH 33/38] fix(clustermetrics,clustervisualization): update nf-test snapshots and test assertions --- .../custom/clustermetrics/tests/main.nf.test | 54 +++++-- .../clustermetrics/tests/main.nf.test.snap | 136 +++++++----------- .../clustervisualization/tests/main.nf.test | 53 ++++--- .../tests/main.nf.test.snap | 129 ++++++++--------- 4 files changed, 187 insertions(+), 185 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test b/modules/nf-core/custom/clustermetrics/tests/main.nf.test index 1d6613fa3709..32f903d2fdeb 100644 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test @@ -1,44 +1,70 @@ nextflow_process { - name "Test Process CLUSTER_METRICS" + + name "Test Process CUSTOM_CLUSTERMETRICS" script "../main.nf" 
process "CUSTOM_CLUSTERMETRICS" + tag "modules" tag "modules_nfcore" tag "custom" tag "custom/clustermetrics" - tag "clustermetrics" test("clustermetrics - features and clusters") { + when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) + ] """ } } + then { - assert process.success - assert snapshot( - process.out.metrics, - process.out.k_sweep, - process.out.selected, - process.out.versions - ).match() + assertAll( + { assert process.success }, + { assert snapshot( + process.out.metrics, + process.out.k_sweep, + process.out.selected, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) } } test("clustermetrics - features and clusters - stub") { + options "-stub" + when { process { """ - input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) ] + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) + ] """ } } + then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot( + process.out.metrics, + process.out.k_sweep, + process.out.selected, + process.out.plots, + process.out.versions, + 
path(process.out.versions[0]).yaml + ).match() } + ) } } -} +} \ No newline at end of file diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap index 15deaf982452..034252978e7f 100644 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap @@ -27,9 +27,17 @@ ], [ "versions.yml:md5,236501fe75ac914d4de40a2c42dbec6b" - ] + ], + { + "CUSTOM_CLUSTERMETRICS": { + "python": "3.13.7", + "pandas": "3.0.0", + "scikit-learn": "1.8.0", + "matplotlib": "3.10.7" + } + } ], - "timestamp": "2026-05-11T13:08:00.102276222", + "timestamp": "2026-05-13T16:29:03.258208972", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -37,90 +45,56 @@ }, "clustermetrics - features and clusters - stub": { "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - [ - { - "id": "test" - }, - "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "3": [ - [ - { - "id": "test" - }, - [ - "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - ], - "4": [ - "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" - ], - "k_sweep": [ - [ - { - "id": "test" - }, - "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "metrics": [ - [ - { - "id": "test" - }, - "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "plots": [ - [ - { - "id": "test" - }, - [ - "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", - 
"test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - ], - "selected": [ + [ + [ + { + "id": "test" + }, + "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, [ - { - "id": "test" - }, - "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" + "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" ] - ], - "versions": [ - "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" ] + ], + [ + "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" + ], + { + "CUSTOM_CLUSTERMETRICS": { + "python": "3.13.7", + "pandas": "3.0.0", + "scikit-learn": "1.8.0", + "matplotlib": "3.10.7" + } } ], - "timestamp": "2026-05-11T13:05:52.932850421", + "timestamp": "2026-05-13T16:29:07.20576465", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test b/modules/nf-core/custom/clustervisualization/tests/main.nf.test index bda59809d0b5..1256c87895f0 100644 --- a/modules/nf-core/custom/clustervisualization/tests/main.nf.test +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test @@ -1,50 +1,69 @@ nextflow_process { + name "Test Process CUSTOM_CLUSTERVISUALIZATION" script "../main.nf" process "CUSTOM_CLUSTERVISUALIZATION" + tag "modules" tag "modules_nfcore" tag "custom" tag "custom/clustervisualization" - tag "clustervisualization" test("clustervisualization - features and clusters") { + when { - params { - nf_test = true - } process { """ - input[0] = [ [id:'test'], + input[0] = [ + [ id:'test' ], 
file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) + ] """ } } + then { - assert process.success - assert snapshot( - process.out.umap_tsv, - process.out.tsne_tsv, - process.out.versions - ).match() + assertAll( + { assert process.success }, + { assert snapshot( + process.out.umap_tsv, + process.out.tsne_tsv, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) } } test("clustervisualization - features and clusters - stub") { + options "-stub" + when { process { """ - input[0] = [ [id:'test'], + input[0] = [ + [ id:'test' ], file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) ] + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) + ] """ } } + then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot( + process.out.umap_tsv, + process.out.tsne_tsv, + process.out.umap_png, + process.out.tsne_png, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) } } -} +} \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap index 610912e901ea..f0911139ed24 100644 --- a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap @@ -1,80 +1,53 @@ { "clustervisualization - features and clusters - stub": { 
"content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - [ - { - "id": "test" - }, - "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "3": [ - [ - { - "id": "test" - }, - "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "4": [ - "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" - ], - "tsne_png": [ - [ - { - "id": "test" - }, - "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "tsne_tsv": [ - [ - { - "id": "test" - }, - "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "umap_png": [ - [ - { - "id": "test" - }, - "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "umap_tsv": [ - [ - { - "id": "test" - }, - "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" + [ + [ + { + "id": "test" + }, + "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" + ], + { + "CUSTOM_CLUSTERVISUALIZATION": { + "python": "3.13.7", + "pandas": "3.0.0", + "matplotlib": "3.10.7", + "seaborn": "0.13.2", + "umap-learn": "0.5.9.post2", + "scikit-learn": "1.8.0" + } } ], - "timestamp": "2026-05-11T14:15:13.987219333", + "timestamp": "2026-05-13T16:35:48.765737703", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -100,9 +73,19 @@ ], [ "versions.yml:md5,43b533ced227b510ca833d01881efc8e" - ] + ], + { + "CUSTOM_CLUSTERVISUALIZATION": { + "python": "3.13.7", + "pandas": "3.0.0", + "matplotlib": "3.10.7", + "seaborn": "0.13.2", + 
"umap-learn": "0.5.9.post2", + "scikit-learn": "1.8.0" + } + } ], - "timestamp": "2026-05-11T14:15:07.547048716", + "timestamp": "2026-05-13T16:35:41.90989098", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" From 6114ebd63a83f18d84e3e3f78fcf06ca46dbacde Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 16:51:18 +0200 Subject: [PATCH 34/38] fix environment conflict --- .../custom/clustermetrics/environment.yml | 7 +++-- .../clustermetrics/tests/main.nf.test.snap | 24 +++++++-------- .../clustervisualization/environment.yml | 14 +++++---- .../tests/main.nf.test.snap | 30 +++++++++---------- 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/modules/nf-core/custom/clustermetrics/environment.yml b/modules/nf-core/custom/clustermetrics/environment.yml index f4cde46c06ab..ccbc287ac332 100644 --- a/modules/nf-core/custom/clustermetrics/environment.yml +++ b/modules/nf-core/custom/clustermetrics/environment.yml @@ -1,3 +1,4 @@ +# clustermetrics/environment.yml --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: @@ -6,8 +7,8 @@ channels: dependencies: - conda-forge::matplotlib=3.9.4 - conda-forge::numpy=2.4.2 - - conda-forge::pandas=2.3.2 + - conda-forge::pandas=2.2.3 - conda-forge::python=3.12.12 - - conda-forge::scikit-learn=1.8.0 + - conda-forge::scikit-learn=1.6.1 - conda-forge::seaborn=0.13.2 - - conda-forge::umap-learn=0.5.12 + - conda-forge::umap-learn=0.5.12 \ No newline at end of file diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap index 034252978e7f..789c38b0def7 100644 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap @@ -26,18 +26,18 @@ ] ], [ - "versions.yml:md5,236501fe75ac914d4de40a2c42dbec6b" + "versions.yml:md5,67cab9dfa6c955b0927cf3ff6fc8c5bd" ], { 
"CUSTOM_CLUSTERMETRICS": { - "python": "3.13.7", - "pandas": "3.0.0", - "scikit-learn": "1.8.0", - "matplotlib": "3.10.7" + "python": "3.12.12", + "pandas": "2.2.3", + "scikit-learn": "1.6.1", + "matplotlib": "3.9.4" } } ], - "timestamp": "2026-05-13T16:29:03.258208972", + "timestamp": "2026-05-13T16:49:11.628681612", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -83,18 +83,18 @@ ] ], [ - "versions.yml:md5,48951b680bf5fe7b6b0242dd160a2618" + "versions.yml:md5,939e81a1c6d66dde0edb847e3e61defd" ], { "CUSTOM_CLUSTERMETRICS": { - "python": "3.13.7", - "pandas": "3.0.0", - "scikit-learn": "1.8.0", - "matplotlib": "3.10.7" + "python": "3.12.12", + "pandas": "2.2.3", + "scikit-learn": "1.6.1", + "matplotlib": "3.9.4" } } ], - "timestamp": "2026-05-13T16:29:07.20576465", + "timestamp": "2026-05-13T16:49:29.799534772", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" diff --git a/modules/nf-core/custom/clustervisualization/environment.yml b/modules/nf-core/custom/clustervisualization/environment.yml index c2cf95a372d2..8addb3305878 100644 --- a/modules/nf-core/custom/clustervisualization/environment.yml +++ b/modules/nf-core/custom/clustervisualization/environment.yml @@ -1,12 +1,14 @@ +# clustervisualization/environment.yml --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda dependencies: - - matplotlib=3.9.* - - pandas=2.2.* - - python=3.12 - - scikit-learn=1.6.* - - seaborn=0.13.* - - umap-learn=0.5.* + - conda-forge::matplotlib=3.9.4 + - conda-forge::numpy=2.4.2 + - conda-forge::pandas=2.2.3 + - conda-forge::python=3.12.12 + - conda-forge::scikit-learn=1.6.1 + - conda-forge::seaborn=0.13.2 + - conda-forge::umap-learn=0.5.12 \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap index f0911139ed24..03d8e5832d23 100644 --- 
a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap @@ -34,20 +34,20 @@ ] ], [ - "versions.yml:md5,a50bc5907c3a28f12238b5497e3f4c67" + "versions.yml:md5,f79a1469aa94553d8d58281262a9d76b" ], { "CUSTOM_CLUSTERVISUALIZATION": { - "python": "3.13.7", - "pandas": "3.0.0", - "matplotlib": "3.10.7", + "python": "3.12.12", + "pandas": "2.2.3", + "matplotlib": "3.9.4", "seaborn": "0.13.2", - "umap-learn": "0.5.9.post2", - "scikit-learn": "1.8.0" + "umap-learn": "0.5.12", + "scikit-learn": "1.6.1" } } ], - "timestamp": "2026-05-13T16:35:48.765737703", + "timestamp": "2026-05-13T16:50:23.618656008", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" @@ -60,7 +60,7 @@ { "id": "test" }, - "test.umap.tsv:md5,2cba3fa6ba2d3ce80ad884b4210403eb" + "test.umap.tsv:md5,50c3bb50b36a174c55dd45201e9c0036" ] ], [ @@ -72,20 +72,20 @@ ] ], [ - "versions.yml:md5,43b533ced227b510ca833d01881efc8e" + "versions.yml:md5,c9099db9969c17be8e1f69dfd9ed925a" ], { "CUSTOM_CLUSTERVISUALIZATION": { - "python": "3.13.7", - "pandas": "3.0.0", - "matplotlib": "3.10.7", + "python": "3.12.12", + "pandas": "2.2.3", + "matplotlib": "3.9.4", "seaborn": "0.13.2", - "umap-learn": "0.5.9.post2", - "scikit-learn": "1.8.0" + "umap-learn": "0.5.12", + "scikit-learn": "1.6.1" } } ], - "timestamp": "2026-05-13T16:35:41.90989098", + "timestamp": "2026-05-13T16:49:58.200828019", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" From 8348609a82866bf8edca58f3bc2504482be95222 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 19:07:11 +0200 Subject: [PATCH 35/38] feat: add custom clustering and metrics modules --- .../nf-core/custom/clustering/environment.yml | 14 ++ modules/nf-core/custom/clustering/main.nf | 40 ++++ modules/nf-core/custom/clustering/meta.yml | 77 ++++++++ .../custom/clustering/templates/clustering.py | 184 ++++++++++++++++++ .../clustering/tests/data/test.eigenvec | 6 + 
.../custom/clustering/tests/main.nf.test | 51 +++++ .../custom/clustering/tests/main.nf.test.snap | 79 ++++++++ 7 files changed, 451 insertions(+) create mode 100644 modules/nf-core/custom/clustering/environment.yml create mode 100644 modules/nf-core/custom/clustering/main.nf create mode 100644 modules/nf-core/custom/clustering/meta.yml create mode 100644 modules/nf-core/custom/clustering/templates/clustering.py create mode 100644 modules/nf-core/custom/clustering/tests/data/test.eigenvec create mode 100644 modules/nf-core/custom/clustering/tests/main.nf.test create mode 100644 modules/nf-core/custom/clustering/tests/main.nf.test.snap diff --git a/modules/nf-core/custom/clustering/environment.yml b/modules/nf-core/custom/clustering/environment.yml new file mode 100644 index 000000000000..4b7a89234526 --- /dev/null +++ b/modules/nf-core/custom/clustering/environment.yml @@ -0,0 +1,14 @@ +# clustermetrics/environment.yml +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::matplotlib=3.9.4 + - conda-forge::numpy=2.4.2 + - conda-forge::pandas=2.2.3 + - conda-forge::python=3.12.12 + - conda-forge::scikit-learn=1.6.1 + - conda-forge::seaborn=0.13.2 + - conda-forge::umap-learn=0.5.12 diff --git a/modules/nf-core/custom/clustering/main.nf b/modules/nf-core/custom/clustering/main.nf new file mode 100644 index 000000000000..2f6f778dc08c --- /dev/null +++ b/modules/nf-core/custom/clustering/main.nf @@ -0,0 +1,40 @@ +process CUSTOM_CLUSTERING { + tag "$meta.id" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" + + input: + tuple val(meta), path(eigenvec) + val algorithm + val n_clusters + val dbscan_eps + val dbscan_min_samples + + output: + tuple val(meta), path("*_clusters.csv") , emit: clusters + tuple val(meta), path("*_clustering_info.json") , emit: info, optional: true + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'clustering.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_clusters.csv + touch ${prefix}_clustering_info.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + pandas: \$(python3 -c "import pandas; print(pandas.__version__)") + scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustering/meta.yml b/modules/nf-core/custom/clustering/meta.yml new file mode 100644 index 000000000000..7eb83d610dfe --- /dev/null +++ b/modules/nf-core/custom/clustering/meta.yml @@ -0,0 +1,77 @@ +name: "CUSTOM_CLUSTERING" +description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2 + --pca" +keywords: + - clustering + - pca + - kmeans + - dbscan + - principal-components +tools: + - "scikit-learn": + description: "Machine learning library for clustering" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - eigenvec: + type: file + description: PLINK2 .eigenvec file generated by --pca + pattern: "*.eigenvec" + ontologies: [] + - algorithm: + type: string + description: Clustering algorithm to use (kmeans or dbscan) + - n_clusters: + type: integer + description: Number of clusters for KMeans + - dbscan_eps: + type: float + description: Epsilon parameter for DBSCAN + - dbscan_min_samples: + type: integer + description: Minimum samples parameter for DBSCAN +output: + clusters: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_clusters.csv": + type: file + description: CSV file with sample_id and assigned cluster + pattern: "*_clusters.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + info: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_clustering_info.json": + type: file + description: JSON file with clustering parameters and statistics + pattern: "*_clustering_info.json" + ontologies: + - edam: http://edamontology.org/format_3464 + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/custom/clustering/templates/clustering.py b/modules/nf-core/custom/clustering/templates/clustering.py new file mode 100644 index 000000000000..5175b294a86c --- /dev/null +++ b/modules/nf-core/custom/clustering/templates/clustering.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 + +import json +import platform +import sklearn +import yaml +import re +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans, DBSCAN + +PC_COL_RE = re.compile('[Pp][Cc][0-9]+', re.IGNORECASE) + + +def convert_eigenvec_to_tsv(eigenvec_path, out_pca, 
id_mode='iid'): + rows = [] + n_pcs = 0 + mode = None + + with eigenvec_path.open('r') as fh: + for line in fh: + line = line.strip() + if not line: + continue + parts = line.split() + if parts[0].startswith('#'): + header = [p.lstrip('#') for p in parts] + if len(header) >= 2 and header[0].upper() == 'FID' and header[1].upper() == 'IID': + mode = 'fid_iid' + elif header[0].upper() == 'IID': + mode = 'iid_only' + continue + if mode is None: + try: + float(parts[1]) + mode = 'iid_only' + except (ValueError, IndexError): + mode = 'fid_iid' + if mode == 'fid_iid': + if len(parts) < 3: + continue + fid = parts[0] + iid = parts[1] + pcs = parts[2:] + sample_id = iid if id_mode == 'iid' else f'{fid}:{iid}' + elif mode == 'iid_only': + if len(parts) < 2: + continue + iid = parts[0] + pcs = parts[1:] + sample_id = iid + else: + raise ValueError(f'Unrecognized eigenvec format in {eigenvec_path}') + if n_pcs == 0: + n_pcs = len(pcs) + rows.append((sample_id, pcs)) + + if not rows: + raise ValueError(f'No valid data found in {eigenvec_path}') + + header = ['sample_id'] + [f'PC{i+1}' for i in range(n_pcs)] + with out_pca.open('w') as fh: + fh.write('\\t'.join(header) + '\\n') + for sample_id, pcs in rows: + fh.write(sample_id + '\\t' + '\\t'.join(pcs) + '\\n') + + print(f'[INFO] Converted {len(rows)} samples with {n_pcs} PCs -> {out_pca}') + return n_pcs + + +def read_table_robust(path): + df = pd.read_csv(path, sep='\\t', dtype=str) + print(f'[DEBUG] Initial read: {df.shape[0]} rows x {df.shape[1]} cols', flush=True) + col_names_upper = set(str(c).upper() for c in df.columns) + + def is_header_row(row): + row_values_upper = [str(v).upper() for v in row.values] + overlap = sum(1 for v in row_values_upper if v in col_names_upper) + if overlap >= 3: + return True + header_keywords = {'FID', 'IID', 'PC1', 'PC2', 'PC3'} + if sum(1 for v in row_values_upper if v in header_keywords) >= 2: + return True + return False + + bad_rows = df.apply(is_header_row, axis=1) + if 
bad_rows.any(): + n_bad = int(bad_rows.sum()) + print(f'[INFO] Removed {n_bad} duplicate header row(s)', flush=True) + df = df[~bad_rows].copy().reset_index(drop=True) + + print(f'[INFO] After cleanup: {df.shape[0]} rows x {df.shape[1]} cols', flush=True) + return df + + +def build_sample_id(df): + cols = list(df.columns) + if 'sample_id' in df.columns: + return df['sample_id'].astype(str), df.drop(columns=['sample_id']) + iid_candidates = [c for c in cols if str(c).upper() == 'IID'] + if iid_candidates: + iid = iid_candidates[0] + return df[iid].astype(str), df.drop(columns=[iid]) + fid_candidates = [c for c in cols if str(c).upper() == 'FID'] + if fid_candidates and iid_candidates: + fid = fid_candidates[0] + iid = iid_candidates[0] + sample_ids = df[iid].astype(str) + return sample_ids, df.drop(columns=[c for c in [fid, iid] if c in df.columns]) + pc_cols = [c for c in cols if PC_COL_RE.match(str(c))] + non_pc_cols = [c for c in cols if c not in pc_cols] + if non_pc_cols: + id_col = non_pc_cols[0] + return df[id_col].astype(str), df.drop(columns=[id_col]) + return pd.Series([f'sample_{i}' for i in range(len(df))], index=df.index), df + + +def main(): + prefix = '${meta.id}' + + pca_tsv = Path(f'{prefix}_pca_scores.tsv') + convert_eigenvec_to_tsv(Path('${eigenvec}'), pca_tsv, 'iid') + + df = read_table_robust(str(pca_tsv)) + sample_ids, df_feats = build_sample_id(df) + + pc_cols = [c for c in df_feats.columns if PC_COL_RE.match(str(c))] + if not pc_cols: + raise ValueError('No PC columns found in input') + + X = df_feats[pc_cols].apply(pd.to_numeric, errors='coerce').values + if np.isnan(X).any(): + raise ValueError('NaN values detected in PCA data') + + print(f'[INFO] Loaded {X.shape[0]} samples x {X.shape[1]} principal components', flush=True) + + if '${algorithm}' == 'kmeans': + model = KMeans(n_clusters=${n_clusters}, init='random', n_init=100, random_state=42) + labels = model.fit_predict(X) + info = {'algorithm': 'kmeans', 'k': ${n_clusters}, 'inertia': 
float(model.inertia_)} + else: + model = DBSCAN(eps=${dbscan_eps}, min_samples=${dbscan_min_samples}) + labels = model.fit_predict(X) + n_found = len(set(labels)) - (1 if -1 in labels else 0) + n_noise = int(np.sum(labels == -1)) + info = { + 'algorithm': 'dbscan', + 'eps': ${dbscan_eps}, + 'min_samples': ${dbscan_min_samples}, + 'n_clusters_found': int(n_found), + 'n_noise': n_noise + } + + out_clusters = f'{prefix}_clusters.csv' + out_info = f'{prefix}_clustering_info.json' + + pd.DataFrame({'sample_id': sample_ids.astype(str), 'cluster': labels}).to_csv(out_clusters, index=False) + info.update({ + 'n_samples': int(X.shape[0]), + 'n_features': int(X.shape[1]), + 'feature_names': pc_cols, + 'input_file': Path('${eigenvec}').name + }) + Path(out_info).write_text(json.dumps(info, indent=2)) + + print('[SUCCESS] Clustering completed:') + print(f' -> Clusters : {out_clusters}') + print(f' -> Info : {out_info}') + + + versions = { + 'CUSTOM_CLUSTERING': { + 'python': platform.python_version(), + 'scikit-learn': sklearn.__version__, + 'pandas': pd.__version__, + 'numpy': np.__version__, + } + } + with open('versions.yml', 'w') as fh: + fh.write(yaml.dump(versions, default_flow_style=False)) + +main() diff --git a/modules/nf-core/custom/clustering/tests/data/test.eigenvec b/modules/nf-core/custom/clustering/tests/data/test.eigenvec new file mode 100644 index 000000000000..d0281ae180ce --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/data/test.eigenvec @@ -0,0 +1,6 @@ +#FID IID PC1 PC2 PC3 +0 sample01 0.1234 0.5678 0.9012 +0 sample02 -0.2345 0.6789 -0.0123 +0 sample03 0.3456 -0.7890 0.1234 +0 sample04 -0.4567 0.8901 -0.2345 +0 sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test b/modules/nf-core/custom/clustering/tests/main.nf.test new file mode 100644 index 000000000000..c590a235983c --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test @@ -0,0 +1,51 @@ +nextflow_process { + name "Test Process 
CUSTOM_CLUSTERING" + script "../main.nf" + process "CUSTOM_CLUSTERING" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustering" + + test("clustering - eigenvec") { + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + assert process.success + assert snapshot( + process.out.clusters, + process.out.info, + process.out.versions + ).match() + } + } + + test("clustering - eigenvec - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test.snap b/modules/nf-core/custom/clustering/tests/main.nf.test.snap new file mode 100644 index 000000000000..b852b800bcb4 --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test.snap @@ -0,0 +1,79 @@ +{ + "clustering - eigenvec - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,664d3210ebe520f6f680bb7c41d9b15e" + ], + "clusters": [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "info": [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,664d3210ebe520f6f680bb7c41d9b15e" + ] + } + ], + "timestamp": "2026-05-13T18:21:30.37624233", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + 
} + }, + "clustering - eigenvec": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,a0ce7a662fecdb42e15e2b2aa0906cf4" + ] + ], + [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,6e61eece1d6cad24489312531115e55a" + ] + ], + [ + "versions.yml:md5,a5f57bd446ec1ba732607243bebd93fc" + ] + ], + "timestamp": "2026-05-13T18:53:15.637280975", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + } +} \ No newline at end of file From b33105dfbd7fe3ce0bfef283ad214a930eeb8490 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 22:18:48 +0200 Subject: [PATCH 36/38] chore: remove extra custom modules from clustering PR --- .../custom/clustermetrics/environment.yml | 14 - modules/nf-core/custom/clustermetrics/main.nf | 43 --- .../nf-core/custom/clustermetrics/meta.yml | 90 ------ .../templates/cluster_metrics.py | 289 ------------------ .../tests/data/test_clusters.csv | 6 - .../tests/data/test_features.tsv | 6 - .../custom/clustermetrics/tests/main.nf.test | 70 ----- .../clustermetrics/tests/main.nf.test.snap | 103 ------- .../clustervisualization/environment.yml | 14 - .../custom/clustervisualization/main.nf | 42 --- .../custom/clustervisualization/meta.yml | 96 ------ .../templates/cluster_viz.py | 234 -------------- .../tests/data/test_clusters.csv | 6 - .../tests/data/test_features.tsv | 6 - .../tests/data/test_pca.eigenvec | 6 - .../clustervisualization/tests/main.nf.test | 69 ----- .../tests/main.nf.test.snap | 94 ------ 17 files changed, 1188 deletions(-) delete mode 100644 modules/nf-core/custom/clustermetrics/environment.yml delete mode 100644 modules/nf-core/custom/clustermetrics/main.nf delete mode 100644 modules/nf-core/custom/clustermetrics/meta.yml delete mode 100644 modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py delete mode 100644 modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv delete mode 100644 modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv delete 
mode 100644 modules/nf-core/custom/clustermetrics/tests/main.nf.test delete mode 100644 modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap delete mode 100644 modules/nf-core/custom/clustervisualization/environment.yml delete mode 100644 modules/nf-core/custom/clustervisualization/main.nf delete mode 100644 modules/nf-core/custom/clustervisualization/meta.yml delete mode 100644 modules/nf-core/custom/clustervisualization/templates/cluster_viz.py delete mode 100644 modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv delete mode 100644 modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv delete mode 100644 modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec delete mode 100644 modules/nf-core/custom/clustervisualization/tests/main.nf.test delete mode 100644 modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap diff --git a/modules/nf-core/custom/clustermetrics/environment.yml b/modules/nf-core/custom/clustermetrics/environment.yml deleted file mode 100644 index ccbc287ac332..000000000000 --- a/modules/nf-core/custom/clustermetrics/environment.yml +++ /dev/null @@ -1,14 +0,0 @@ -# clustermetrics/environment.yml ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::matplotlib=3.9.4 - - conda-forge::numpy=2.4.2 - - conda-forge::pandas=2.2.3 - - conda-forge::python=3.12.12 - - conda-forge::scikit-learn=1.6.1 - - conda-forge::seaborn=0.13.2 - - conda-forge::umap-learn=0.5.12 \ No newline at end of file diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf deleted file mode 100644 index e7fe70ca970e..000000000000 --- a/modules/nf-core/custom/clustermetrics/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process CUSTOM_CLUSTERMETRICS { - tag "$meta.id" - label 'process_medium' - conda 
"${moduleDir}/environment.yml" - container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" - input: - tuple val(meta), path(features), path(clusters) - - output: - tuple val(meta), path("*_metrics.tsv") , emit: metrics - tuple val(meta), path("*_k_sweep.csv") , emit: k_sweep - tuple val(meta), path("*_selected.json") , emit: selected - tuple val(meta), path("*.png") , emit: plots, optional: true - path "versions.yml" , emit: versions, topic: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'cluster_metrics.py' - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_metrics.tsv - touch ${prefix}_k_sweep.csv - touch ${prefix}_selected.json - touch ${prefix}_elbow.png - touch ${prefix}_silhouette.png - touch ${prefix}_davies_bouldin.png - touch ${prefix}_calinski.png - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | sed 's/Python //') - pandas: \$(python3 -c "import pandas; print(pandas.__version__)") - scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") - matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") - END_VERSIONS - """ -} diff --git a/modules/nf-core/custom/clustermetrics/meta.yml b/modules/nf-core/custom/clustermetrics/meta.yml deleted file mode 100644 index 432d56069427..000000000000 --- a/modules/nf-core/custom/clustermetrics/meta.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: "CUSTOM_CLUSTERMETRICS" -description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, - Davies-Bouldin) and performs k-sweep analysis" -keywords: - - clustering - - metrics - - silhouette - - calinski-harabasz - - davies-bouldin - - evaluation -tools: 
- - "scikit-learn": - description: "Machine learning library for clustering metrics" - homepage: "https://scikit-learn.org/" - documentation: "https://scikit-learn.org/stable/modules/clustering.html" - licence: - - "BSD-3-Clause" - identifier: "" -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1' ]` - - features: - type: file - description: Feature matrix file - pattern: "*" - ontologies: [] - - clusters: - type: file - description: Cluster assignment file - pattern: "*" - ontologies: [] -output: - metrics: - - - meta: - type: map - description: Groovy Map containing sample information - - "*_metrics.tsv": - type: file - description: TSV with selected cluster quality metrics - pattern: "*_metrics.tsv" - ontologies: - - edam: http://edamontology.org/format_3475 - k_sweep: - - - meta: - type: map - description: Groovy Map containing sample information - - "*_k_sweep.csv": - type: file - description: CSV with metrics for different values of k - pattern: "*_k_sweep.csv" - ontologies: - - edam: http://edamontology.org/format_3752 - selected: - - - meta: - type: map - description: Groovy Map containing sample information - - "*_selected.json": - type: file - description: JSON with the selected/best metrics - pattern: "*_selected.json" - ontologies: - - edam: http://edamontology.org/format_3464 - plots: - - - meta: - type: map - description: Groovy Map containing sample information - - "*.png": - type: file - description: Optional PNG plots (elbow, silhouette, etc.) 
- pattern: "*.png" - ontologies: [] - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 -topics: - versions: - - versions.yml: - type: string - description: The name of the process -authors: - - "@dbaku42" -maintainers: - - "@dbaku42" diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py deleted file mode 100644 index f8fb15f2e776..000000000000 --- a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import platform -import sys -from pathlib import Path - -import matplotlib -import numpy as np -import pandas as pd -import sklearn -from sklearn.cluster import KMeans -from sklearn.metrics import ( - calinski_harabasz_score, - davies_bouldin_score, - silhouette_score, -) - -matplotlib.use("Agg") - - -def format_yaml_like(data: dict, indent: int = 0) -> str: - """Formats a dictionary to a YAML-like string (nf-core standard).""" - yaml_str = "" - for key, value in data.items(): - spaces = " " * indent - if isinstance(value, dict): - yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" - else: - yaml_str += f"{spaces}{key}: {value}\\n" - return yaml_str - - -def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy() - df.columns = [str(c).lstrip("#") for c in df.columns] - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "IID" in cols_upper: - iid_col = cols_upper["IID"] - dup_mask = df[iid_col].astype(str).str.upper().isin({"FID", "IID"}) - if dup_mask.any(): - df = df.loc[~dup_mask].copy().reset_index(drop=True) - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "SAMPLE_ID" in cols_upper: - sample_col = cols_upper["SAMPLE_ID"] - if sample_col != "sample_id": - df = 
df.rename(columns={sample_col: "sample_id"}) - return df - - if "IID" in cols_upper: - iid_col = cols_upper["IID"] - iid_numeric = pd.to_numeric(df[iid_col], errors="coerce").notna().all() - - if iid_numeric: - df = df.drop(columns=[iid_col]) - if len(df.columns) == 0: - raise ValueError("Cannot infer sample_id after dropping numeric IID column") - df = df.rename(columns={df.columns[0]: "sample_id"}) - else: - df = df.rename(columns={iid_col: "sample_id"}) - - fid_cols = [c for c in df.columns if str(c).upper() == "FID"] - if fid_cols: - df = df.drop(columns=fid_cols) - - return df - - raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). Found: {list(df.columns)}") - - -def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: - df = pd.read_csv(path, sep="\\t", dtype=str) - df = _normalise_id_column(df) - - if "sample_id" not in df.columns: - raise ValueError("features file must contain a sample_id column after normalization") - - sample_ids = df["sample_id"].astype(str) - x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce") - x = x.fillna(x.mean(numeric_only=True)) - x = x.fillna(0.0) - - return x, sample_ids - - -def load_clusters(path: str) -> tuple[pd.Series, str]: - df = pd.read_csv(path) - if "sample_id" in df.columns and "cluster" in df.columns: - series = df.set_index(df["sample_id"].astype(str))["cluster"].astype(int) - return series, "sample_id" - elif "cluster" in df.columns: - series = df["cluster"].astype(int).reset_index(drop=True) - return series, "row_order" - else: - raise ValueError( - f"clusters file must have a 'cluster' column (and optionally 'sample_id'). 
Found: {list(df.columns)}" - ) - - -def safe_cluster_metrics(x: np.ndarray, labels: np.ndarray) -> dict: - uniq = np.unique(labels) - n_clusters = len(uniq) - (1 if -1 in uniq else 0) - - if n_clusters < 2: - return { - "n_clusters": int(n_clusters), - "silhouette": None, - "calinski_harabasz": None, - "davies_bouldin": None, - } - - mask = labels != -1 - x_use, y_use = x[mask], labels[mask] - - if len(x_use) < 2 or len(np.unique(y_use)) < 2: - return { - "n_clusters": int(n_clusters), - "silhouette": None, - "calinski_harabasz": None, - "davies_bouldin": None, - } - - return { - "n_clusters": int(n_clusters), - "silhouette": float(silhouette_score(x_use, y_use)), - "calinski_harabasz": float(calinski_harabasz_score(x_use, y_use)), - "davies_bouldin": float(davies_bouldin_score(x_use, y_use)), - } - - -def main() -> None: - ap = argparse.ArgumentParser() - ap.add_argument("--features", required=True) - ap.add_argument("--clusters", required=True) - ap.add_argument("--k-min", type=int, default=2) - ap.add_argument("--k-max", type=int, default=12) - ap.add_argument("--out-k-sweep", required=True) - ap.add_argument("--out-selected", required=True) - ap.add_argument("--out-prefix", required=True) - args = ap.parse_args() - - x_df, sample_ids = load_features(args.features) - clusters_s, cluster_mode = load_clusters(args.clusters) - - if cluster_mode == "sample_id": - common = sample_ids[sample_ids.isin(clusters_s.index)] - - if len(common) > 0: - x = x_df.loc[common.index].values - labels = clusters_s.loc[common.values].values - aligned_ids = common.astype(str).tolist() - alignment_mode = "sample_id" - elif len(clusters_s) == len(sample_ids): - x = x_df.values - labels = clusters_s.values - aligned_ids = sample_ids.astype(str).tolist() - alignment_mode = "row_order_fallback" - else: - raise ValueError( - f"No overlapping sample_id between features and clusters.\\n" - f" features IDs (first 5): {sample_ids.head().tolist()}\\n" - f" clusters IDs (first 5): 
{list(clusters_s.index[:5])}" - ) - else: - if len(clusters_s) != len(sample_ids): - raise ValueError( - "clusters CSV has no usable sample_id column and row counts do not match.\\n" - f" n_features={len(sample_ids)}\\n" - f" n_clusters={len(clusters_s)}" - ) - x = x_df.values - labels = clusters_s.values - aligned_ids = sample_ids.astype(str).tolist() - alignment_mode = "row_order" - - if len(x) < 2: - raise ValueError("Need at least 2 samples to compute cluster metrics") - - selected = safe_cluster_metrics(x, labels) - selected["input_clusters"] = Path(args.clusters).name - selected["input_features"] = Path(args.features).name - selected["n_samples_used"] = int(len(aligned_ids)) - selected["alignment_mode"] = alignment_mode - - metrics_tsv = f"{args.out_prefix}_metrics.tsv" - pd.DataFrame([selected]).to_csv(metrics_tsv, sep="\\t", index=False) - - rows = [] - max_k = min(int(args.k_max), len(x)) - for k in range(int(args.k_min), max_k + 1): - model = KMeans(n_clusters=k, n_init=10, random_state=42) - y = model.fit_predict(x) - - sil = ch = db = None - if 1 < len(np.unique(y)) < len(x): - sil = float(silhouette_score(x, y)) - ch = float(calinski_harabasz_score(x, y)) - db = float(davies_bouldin_score(x, y)) - - rows.append( - { - "k": k, - "inertia": float(model.inertia_), - "silhouette": sil, - "calinski_harabasz": ch, - "davies_bouldin": db, - } - ) - - sweep_df = pd.DataFrame(rows) - sweep_df.to_csv(args.out_k_sweep, sep=",", index=False, float_format="%.10g") - Path(args.out_selected).write_text(json.dumps(selected, indent=2)) - - pfx = args.out_prefix - try: - import matplotlib.pyplot as plt - - def plot_curve(metric, title, ylabel, out_png): - plt.figure(figsize=(7, 4.5)) - vals = sweep_df[metric].dropna() - ks = sweep_df.loc[vals.index, "k"] - plt.plot(ks, vals, marker="o") - plt.xticks(sweep_df["k"].tolist()) - plt.title(title) - plt.xlabel("k") - plt.ylabel(ylabel) - plt.tight_layout() - plt.savefig(out_png, dpi=200) - plt.close() - - if not 
sweep_df.empty: - plot_curve("inertia", "Elbow method (KMeans inertia)", "inertia", f"{pfx}_elbow.png") - plot_curve("silhouette", "Silhouette score (higher is better)", "silhouette", f"{pfx}_silhouette.png") - plot_curve( - "davies_bouldin", - "Davies-Bouldin index (lower is better)", - "davies_bouldin", - f"{pfx}_davies_bouldin.png", - ) - plot_curve( - "calinski_harabasz", - "Calinski-Harabasz index (higher is better)", - "calinski_harabasz", - f"{pfx}_calinski.png", - ) - - except Exception as e: - Path("plot_warning.txt").write_text("Plotting failed: " + str(e) + "\\n") - - # === VERSIONS.YML (fix review) === - versions = { - "${task.process}": { - "python": platform.python_version(), - "pandas": pd.__version__, - "scikit-learn": sklearn.__version__, - "matplotlib": matplotlib.__version__, - } - } - with open("versions.yml", "w") as f: - f.write(format_yaml_like(versions)) - - -if __name__ == "__main__": - prefix = "${task.ext.prefix ? task.ext.prefix : meta.id}" - - sys.argv = [ - "cluster_metrics.py", - "--features", - "$features", - "--clusters", - "$clusters", - "--k-min", - "2", - "--k-max", - "12", - "--out-k-sweep", - f"{prefix}_k_sweep.csv", - "--out-selected", - f"{prefix}_selected.json", - "--out-prefix", - prefix, - ] - - main() diff --git a/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv b/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv deleted file mode 100644 index 1258849b8fbe..000000000000 --- a/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv +++ /dev/null @@ -1,6 +0,0 @@ -sample_id,cluster -sample01,0 -sample02,2 -sample03,1 -sample04,2 -sample05,1 diff --git a/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv b/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv deleted file mode 100644 index 033d23b82df8..000000000000 --- a/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv +++ /dev/null @@ -1,6 +0,0 @@ -sample_id PC1 PC2 PC3 -sample01 
0.1234 0.5678 0.9012 -sample02 -0.2345 0.6789 -0.0123 -sample03 0.3456 -0.7890 0.1234 -sample04 -0.4567 0.8901 -0.2345 -sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test b/modules/nf-core/custom/clustermetrics/tests/main.nf.test deleted file mode 100644 index 32f903d2fdeb..000000000000 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test +++ /dev/null @@ -1,70 +0,0 @@ -nextflow_process { - - name "Test Process CUSTOM_CLUSTERMETRICS" - script "../main.nf" - process "CUSTOM_CLUSTERMETRICS" - - tag "modules" - tag "modules_nfcore" - tag "custom" - tag "custom/clustermetrics" - - test("clustermetrics - features and clusters") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.metrics, - process.out.k_sweep, - process.out.selected, - process.out.versions, - path(process.out.versions[0]).yaml - ).match() } - ) - } - } - - test("clustermetrics - features and clusters - stub") { - - options "-stub" - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.metrics, - process.out.k_sweep, - process.out.selected, - process.out.plots, - process.out.versions, - path(process.out.versions[0]).yaml - ).match() } - ) - } - } -} \ No newline at end of file diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap 
b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap deleted file mode 100644 index 789c38b0def7..000000000000 --- a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap +++ /dev/null @@ -1,103 +0,0 @@ -{ - "clustermetrics - features and clusters": { - "content": [ - [ - [ - { - "id": "test" - }, - "test_metrics.tsv:md5,7bfdac9bca90ba2ea3c03eca25b24f28" - ] - ], - [ - [ - { - "id": "test" - }, - "test_k_sweep.csv:md5,8c382a00d959ae5d6e19f42cf9278f37" - ] - ], - [ - [ - { - "id": "test" - }, - "test_selected.json:md5,633493b1585fb0ec0a81629bde4c00cb" - ] - ], - [ - "versions.yml:md5,67cab9dfa6c955b0927cf3ff6fc8c5bd" - ], - { - "CUSTOM_CLUSTERMETRICS": { - "python": "3.12.12", - "pandas": "2.2.3", - "scikit-learn": "1.6.1", - "matplotlib": "3.9.4" - } - } - ], - "timestamp": "2026-05-13T16:49:11.628681612", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - }, - "clustermetrics - features and clusters - stub": { - "content": [ - [ - [ - { - "id": "test" - }, - "test_metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - "test_k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - "test_selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - [ - "test_calinski.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", - "test_silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - ], - [ - "versions.yml:md5,939e81a1c6d66dde0edb847e3e61defd" - ], - { - "CUSTOM_CLUSTERMETRICS": { - "python": "3.12.12", - "pandas": "2.2.3", - "scikit-learn": "1.6.1", - "matplotlib": "3.9.4" - } - } - ], - "timestamp": "2026-05-13T16:49:29.799534772", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - } -} \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/environment.yml 
b/modules/nf-core/custom/clustervisualization/environment.yml deleted file mode 100644 index 8addb3305878..000000000000 --- a/modules/nf-core/custom/clustervisualization/environment.yml +++ /dev/null @@ -1,14 +0,0 @@ -# clustervisualization/environment.yml ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::matplotlib=3.9.4 - - conda-forge::numpy=2.4.2 - - conda-forge::pandas=2.2.3 - - conda-forge::python=3.12.12 - - conda-forge::scikit-learn=1.6.1 - - conda-forge::seaborn=0.13.2 - - conda-forge::umap-learn=0.5.12 \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf deleted file mode 100644 index 7d9156a0cb14..000000000000 --- a/modules/nf-core/custom/clustervisualization/main.nf +++ /dev/null @@ -1,42 +0,0 @@ -process CUSTOM_CLUSTERVISUALIZATION { - tag "$meta.id" - label 'process_medium' - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
- 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : - 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" - input: - tuple val(meta), path(features), path(clusters) - - output: - tuple val(meta), path("*.umap.tsv") , emit: umap_tsv - tuple val(meta), path("*.tsne.tsv") , emit: tsne_tsv - tuple val(meta), path("*.umap.png") , emit: umap_png, optional: true - tuple val(meta), path("*.tsne.png") , emit: tsne_png, optional: true - path "versions.yml" , emit: versions, topic: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'cluster_viz.py' - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.umap.tsv - touch ${prefix}.tsne.tsv - touch ${prefix}.umap.png - touch ${prefix}.tsne.png - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python3 --version | sed 's/Python //') - pandas: \$(python3 -c "import pandas; print(pandas.__version__)") - matplotlib: \$(python3 -c "import matplotlib; print(matplotlib.__version__)") - seaborn: \$(python3 -c "import seaborn; print(seaborn.__version__)") - umap-learn: \$(python3 -c "import umap; print(umap.__version__)") - scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") - END_VERSIONS - """ -} diff --git a/modules/nf-core/custom/clustervisualization/meta.yml b/modules/nf-core/custom/clustervisualization/meta.yml deleted file mode 100644 index 0d90ab6fa53d..000000000000 --- a/modules/nf-core/custom/clustervisualization/meta.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: "CUSTOM_CLUSTERVISUALIZATION" -description: "Generates UMAP and t-SNE visualizations colored by cluster" -keywords: - - clustering - - visualization - - pca - - umap - - tsne - - dimension-reduction -tools: - - scikit-learn: - description: "Machine learning library for dimension reduction (PCA, t-SNE)" - homepage: "https://scikit-learn.org/" - documentation: 
"https://scikit-learn.org/stable/modules/clustering.html" - licence: - - "BSD-3-Clause" - identifier: "" - - umap-learn: - description: "Uniform Manifold Approximation and Projection for dimension reduction" - homepage: "https://umap-learn.readthedocs.io/" - documentation: "https://umap-learn.readthedocs.io/en/latest/" - licence: - - "BSD-3-Clause" - identifier: "" -input: - - - meta: - type: map - description: "Groovy Map containing sample information" - - features: - type: file - description: "TSV file with sample_id and numeric features" - pattern: "*.tsv" - ontologies: - - edam: "http://edamontology.org/format_3475" - - clusters: - type: file - description: "CSV/TSV file with sample_id and cluster assignment" - pattern: "*_clusters.*" - ontologies: [] -output: - umap_tsv: - - - meta: - type: map - description: "Groovy Map containing sample information" - - "*.umap.tsv": - type: file - description: "UMAP coordinates per sample" - pattern: "*.umap.tsv" - ontologies: - - edam: "http://edamontology.org/operation_2432" - - edam: http://edamontology.org/format_3475 - tsne_tsv: - - - meta: - type: map - description: "Groovy Map containing sample information" - - "*.tsne.tsv": - type: file - description: "t-SNE coordinates per sample" - pattern: "*.tsne.tsv" - ontologies: - - edam: "http://edamontology.org/operation_2432" - - edam: http://edamontology.org/format_3475 - umap_png: - - - meta: - type: map - description: "Groovy Map containing sample information" - - "*.umap.png": - type: file - description: "UMAP visualization coloured by cluster" - pattern: "*.umap.png" - ontologies: [] - tsne_png: - - - meta: - type: map - description: "Groovy Map containing sample information" - - "*.tsne.png": - type: file - description: "t-SNE visualization coloured by cluster" - pattern: "*.tsne.png" - ontologies: [] - versions: - - versions.yml: - type: file - description: "Software versions used in the module" - pattern: "versions.yml" - ontologies: - - edam: 
http://edamontology.org/format_3750 -topics: - versions: - - versions.yml: - type: string - description: The name of the process -authors: - - "@dbaku42" -maintainers: - - "@dbaku42" diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py deleted file mode 100644 index c571d16ac884..000000000000 --- a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python3 - -import os - -# Fix numba + matplotlib in read-only Singularity container -os.environ["NUMBA_CACHE_DIR"] = "/tmp" -os.environ["MPLCONFIGDIR"] = "/tmp" - -import platform - -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns -import sklearn -import umap as umap_module -from sklearn.manifold import TSNE -from umap import UMAP - -matplotlib.use("Agg") - - -def format_yaml_like(data: dict, indent: int = 0) -> str: - """Formats a dictionary to a YAML-like string (nf-core standard).""" - yaml_str = "" - for key, value in data.items(): - spaces = " " * indent - if isinstance(value, dict): - yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" - else: - yaml_str += f"{spaces}{key}: {value}\\n" - return yaml_str - - -def _normalise_id_column(df: pd.DataFrame) -> pd.DataFrame: - df = df.copy() - df.columns = [str(c).lstrip("#") for c in df.columns] - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "IID" in cols_upper: - iid_col = cols_upper["IID"] - dup_mask = df[iid_col].astype(str).str.upper().isin({"FID", "IID"}) - if dup_mask.any(): - df = df.loc[~dup_mask].copy().reset_index(drop=True) - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "SAMPLE_ID" in cols_upper: - sample_col = cols_upper["SAMPLE_ID"] - if sample_col != "sample_id": - df = df.rename(columns={sample_col: "sample_id"}) - return df - - if "IID" in cols_upper: - iid_col = 
cols_upper["IID"] - iid_numeric = pd.to_numeric(df[iid_col], errors="coerce").notna().all() - - if iid_numeric: - df = df.drop(columns=[iid_col]) - if len(df.columns) == 0: - raise ValueError("Cannot infer sample_id after dropping numeric IID column") - df = df.rename(columns={df.columns[0]: "sample_id"}) - else: - df = df.rename(columns={iid_col: "sample_id"}) - - fid_cols = [c for c in df.columns if str(c).upper() == "FID"] - if fid_cols: - df = df.drop(columns=fid_cols) - - return df - - raise ValueError(f"Cannot find sample ID column (expected 'sample_id' or 'IID'). Found: {list(df.columns)}") - - -def load_features(path: str) -> tuple[pd.DataFrame, pd.Series]: - df = pd.read_csv(path, sep="\\t", dtype=str) - df = _normalise_id_column(df) - - if "sample_id" not in df.columns: - raise ValueError("features file must contain a sample_id column after normalization") - - sample_ids = df["sample_id"].astype(str) - x = df.drop(columns=["sample_id"]).apply(pd.to_numeric, errors="coerce") - x = x.fillna(x.mean(numeric_only=True)) - x = x.fillna(0.0) - - return x, sample_ids - - -def load_clusters(path: str) -> tuple[pd.DataFrame, str]: - """Load clusters and return (df, mode). 
Same logic as cluster_metrics.""" - df = pd.read_csv(path, sep=",", dtype=str) - df = df.copy() - df.columns = [str(c).lstrip("#") for c in df.columns] - - cols_upper = {str(c).upper(): c for c in df.columns} - - if "CLUSTER" not in cols_upper: - raise ValueError("clusters CSV must have a 'cluster' column") - - cluster_col = cols_upper["CLUSTER"] - - if "SAMPLE_ID" in cols_upper: - sample_col = cols_upper["SAMPLE_ID"] - out = df[[sample_col, cluster_col]].copy() - out.columns = ["sample_id", "cluster"] - out["sample_id"] = out["sample_id"].astype(str) - out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) - return out, "sample_id" - - try: - norm = _normalise_id_column(df.copy()) - if "sample_id" in norm.columns and "cluster" in norm.columns: - out = norm[["sample_id", "cluster"]].copy() - out["sample_id"] = out["sample_id"].astype(str) - out["cluster"] = pd.to_numeric(out["cluster"], errors="raise").astype(int) - return out, "sample_id" - except Exception: - pass - - other_cols = [c for c in df.columns if c != cluster_col] - - if len(other_cols) == 1: - candidate = other_cols[0] - candidate_vals = df[candidate].astype(str) - - if not ( - len(candidate_vals) > 0 and float(pd.to_numeric(candidate_vals, errors="coerce").notna().mean()) >= 0.8 - ): - out = pd.DataFrame( - { - "sample_id": candidate_vals, - "cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int), - } - ) - return out, "sample_id" - - out = pd.DataFrame({"cluster": pd.to_numeric(df[cluster_col], errors="raise").astype(int)}) - return out, "row_order" - - -def plot_embedding(x: np.ndarray, labels: np.ndarray, method: str, prefix: str) -> None: - """Plot UMAP or t-SNE with cluster coloring.""" - if method == "umap": - reducer = UMAP(random_state=42) - embedding = reducer.fit_transform(x) - title = "UMAP" - out_tsv = f"{prefix}.umap.tsv" - out_png = f"{prefix}.umap.png" - else: # tsne - reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, len(x) - 1)) - 
embedding = reducer.fit_transform(x) - title = "t-SNE" - out_tsv = f"{prefix}.tsne.tsv" - out_png = f"{prefix}.tsne.png" - - # Save embedding - emb_df = pd.DataFrame(embedding, columns=["Dim1", "Dim2"]) - emb_df["cluster"] = labels - emb_df.to_csv(out_tsv, sep="\\t", index=False) - - # Plot - plt.figure(figsize=(8, 6)) - palette = sns.color_palette("tab10", n_colors=len(np.unique(labels))) - sns.scatterplot( - x=embedding[:, 0], - y=embedding[:, 1], - hue=labels.astype(str), - palette=palette, - alpha=0.8, - s=60, - edgecolor="k", - linewidth=0.3, - ) - plt.title(f"{title} projection of features colored by cluster") - plt.xlabel(f"{title} 1") - plt.ylabel(f"{title} 2") - plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left") - plt.tight_layout() - plt.savefig(out_png, dpi=200, bbox_inches="tight") - plt.close() - - -def main() -> None: - features = "$features" - clusters_path = "$clusters" - prefix = "${task.ext.prefix ?: meta.id}" - - x_df, sample_ids = load_features(features) - clusters_df, cluster_mode = load_clusters(clusters_path) - - if cluster_mode == "sample_id": - clusters = clusters_df.set_index("sample_id")["cluster"] - common = sample_ids[sample_ids.isin(clusters.index)] - if len(common) > 0: - x = x_df.loc[common.index].values - labels = clusters.loc[common.values].values - elif len(clusters_df) == len(sample_ids): - x = x_df.values - labels = clusters_df["cluster"].values - else: - raise ValueError("No overlapping sample_id between features and clusters") - else: - if len(clusters_df) != len(sample_ids): - raise ValueError("Row counts do not match and no sample_id column found") - x = x_df.values - labels = clusters_df["cluster"].values - - if len(x) < 2: - raise ValueError("Need at least 2 samples for embedding") - - # Generate both embeddings - plot_embedding(x, labels, "umap", prefix) - plot_embedding(x, labels, "tsne", prefix) - - # versions.yml - versions = { - "${task.process}": { - "python": platform.python_version(), - 
"pandas": pd.__version__, - "matplotlib": matplotlib.__version__, - "seaborn": sns.__version__, - "umap-learn": umap_module.__version__, - "scikit-learn": sklearn.__version__, - } - } - with open("versions.yml", "w") as f: - f.write(format_yaml_like(versions)) - - -if __name__ == "__main__": - main() diff --git a/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv b/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv deleted file mode 100644 index 1258849b8fbe..000000000000 --- a/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv +++ /dev/null @@ -1,6 +0,0 @@ -sample_id,cluster -sample01,0 -sample02,2 -sample03,1 -sample04,2 -sample05,1 diff --git a/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv b/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv deleted file mode 100644 index 033d23b82df8..000000000000 --- a/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv +++ /dev/null @@ -1,6 +0,0 @@ -sample_id PC1 PC2 PC3 -sample01 0.1234 0.5678 0.9012 -sample02 -0.2345 0.6789 -0.0123 -sample03 0.3456 -0.7890 0.1234 -sample04 -0.4567 0.8901 -0.2345 -sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec b/modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec deleted file mode 100644 index 61aae5d8b413..000000000000 --- a/modules/nf-core/custom/clustervisualization/tests/data/test_pca.eigenvec +++ /dev/null @@ -1,6 +0,0 @@ -#FID IID PC1 PC2 PC3 -0 sample01 0.1234 0.5678 0.9012 -0 sample02 -0.2345 0.6789 -0.0123 -0 sample03 0.3456 -0.7890 0.1234 -0 sample04 -0.4567 0.8901 -0.2345 -0 sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test b/modules/nf-core/custom/clustervisualization/tests/main.nf.test deleted file mode 100644 index 1256c87895f0..000000000000 --- 
a/modules/nf-core/custom/clustervisualization/tests/main.nf.test +++ /dev/null @@ -1,69 +0,0 @@ -nextflow_process { - - name "Test Process CUSTOM_CLUSTERVISUALIZATION" - script "../main.nf" - process "CUSTOM_CLUSTERVISUALIZATION" - - tag "modules" - tag "modules_nfcore" - tag "custom" - tag "custom/clustervisualization" - - test("clustervisualization - features and clusters") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.umap_tsv, - process.out.tsne_tsv, - process.out.versions, - path(process.out.versions[0]).yaml - ).match() } - ) - } - } - - test("clustervisualization - features and clusters - stub") { - - options "-stub" - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), - file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.umap_tsv, - process.out.tsne_tsv, - process.out.umap_png, - process.out.tsne_png, - process.out.versions, - path(process.out.versions[0]).yaml - ).match() } - ) - } - } -} \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap deleted file mode 100644 index 03d8e5832d23..000000000000 --- a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap +++ /dev/null @@ -1,94 +0,0 @@ -{ - "clustervisualization - features and clusters - stub": { - "content": [ - [ - [ - 
{ - "id": "test" - }, - "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - [ - { - "id": "test" - }, - "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - [ - "versions.yml:md5,f79a1469aa94553d8d58281262a9d76b" - ], - { - "CUSTOM_CLUSTERVISUALIZATION": { - "python": "3.12.12", - "pandas": "2.2.3", - "matplotlib": "3.9.4", - "seaborn": "0.13.2", - "umap-learn": "0.5.12", - "scikit-learn": "1.6.1" - } - } - ], - "timestamp": "2026-05-13T16:50:23.618656008", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - }, - "clustervisualization - features and clusters": { - "content": [ - [ - [ - { - "id": "test" - }, - "test.umap.tsv:md5,50c3bb50b36a174c55dd45201e9c0036" - ] - ], - [ - [ - { - "id": "test" - }, - "test.tsne.tsv:md5,738a97587fa8c72614d2655eddbd2f7a" - ] - ], - [ - "versions.yml:md5,c9099db9969c17be8e1f69dfd9ed925a" - ], - { - "CUSTOM_CLUSTERVISUALIZATION": { - "python": "3.12.12", - "pandas": "2.2.3", - "matplotlib": "3.9.4", - "seaborn": "0.13.2", - "umap-learn": "0.5.12", - "scikit-learn": "1.6.1" - } - } - ], - "timestamp": "2026-05-13T16:49:58.200828019", - "meta": { - "nf-test": "0.9.5", - "nextflow": "25.09.0" - } - } -} \ No newline at end of file From 280aac970bdb6ccf7e7cd1e30d3c41a837b85ba2 Mon Sep 17 00:00:00 2001 From: dbaku42 Date: Wed, 13 May 2026 22:26:54 +0200 Subject: [PATCH 37/38] test: update clustering snapshot after pyyaml addition --- modules/nf-core/custom/clustering/tests/main.nf.test.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test.snap b/modules/nf-core/custom/clustering/tests/main.nf.test.snap index b852b800bcb4..8a29091d57a2 100644 --- a/modules/nf-core/custom/clustering/tests/main.nf.test.snap +++ 
b/modules/nf-core/custom/clustering/tests/main.nf.test.snap @@ -63,14 +63,14 @@ { "id": "test" }, - "test_clustering_info.json:md5,6e61eece1d6cad24489312531115e55a" + "test_clustering_info.json:md5,c4cb7430071a48a117eae03f66e654ed" ] ], [ "versions.yml:md5,a5f57bd446ec1ba732607243bebd93fc" ] ], - "timestamp": "2026-05-13T18:53:15.637280975", + "timestamp": "2026-05-13T22:26:38.454903789", "meta": { "nf-test": "0.9.5", "nextflow": "25.09.0" From 9d98375e7b8584de8f739c31f76e20aab92e0d28 Mon Sep 17 00:00:00 2001 From: Donald Baku <141358602+dbaku42@users.noreply.github.com> Date: Thu, 14 May 2026 12:04:22 +0200 Subject: [PATCH 38/38] Add pyyaml version 6.0.2 to environment.yml --- modules/nf-core/custom/clustering/environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/nf-core/custom/clustering/environment.yml b/modules/nf-core/custom/clustering/environment.yml index 4b7a89234526..6dd100648188 100644 --- a/modules/nf-core/custom/clustering/environment.yml +++ b/modules/nf-core/custom/clustering/environment.yml @@ -12,3 +12,4 @@ dependencies: - conda-forge::scikit-learn=1.6.1 - conda-forge::seaborn=0.13.2 - conda-forge::umap-learn=0.5.12 + - conda-forge::pyyaml=6.0.2