diff --git a/.gitignore b/.gitignore index df5aafd3cc74..b9460e81b015 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,14 @@ test_output/ tests/data/ work/ .github/CODEOWNERS-tmp +modules/local/ +vcf_data/ +subworkflows/nf-core/snpclustering/modules/ +subworkflows/nf-core/snpclustering/run.log +subworkflows/nf-core/snpclustering/run_test.nf +subworkflows/nf-core/snpclustering/test_local.nf +subworkflows/nf-core/snpclustering/scripts/ +subworkflows/nf-core/snpclustering/Dockerfile +subworkflows/nf-core/snpclustering/tests/ +subworkflows/nf-core/snpclustering/main.nf +modules/nf-core/clustering/ diff --git a/modules/nf-core/custom/clustering/environment.yml b/modules/nf-core/custom/clustering/environment.yml new file mode 100644 index 000000000000..6dd100648188 --- /dev/null +++ b/modules/nf-core/custom/clustering/environment.yml @@ -0,0 +1,15 @@ +# custom/clustering/environment.yml +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::matplotlib=3.9.4 + - conda-forge::numpy=2.4.2 + - conda-forge::pandas=2.2.3 + - conda-forge::python=3.12.12 + - conda-forge::scikit-learn=1.6.1 + - conda-forge::seaborn=0.13.2 + - conda-forge::umap-learn=0.5.12 + - conda-forge::pyyaml=6.0.2 diff --git a/modules/nf-core/custom/clustering/main.nf b/modules/nf-core/custom/clustering/main.nf new file mode 100644 index 000000000000..2f6f778dc08c --- /dev/null +++ b/modules/nf-core/custom/clustering/main.nf @@ -0,0 +1,40 @@ +process CUSTOM_CLUSTERING { + tag "$meta.id" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' : + 'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }" + + input: + tuple val(meta), path(eigenvec) + val algorithm + val n_clusters + val dbscan_eps + val dbscan_min_samples + + output: + tuple val(meta), path("*_clusters.csv") , emit: clusters + tuple val(meta), path("*_clustering_info.json") , emit: info, optional: true + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'clustering.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_clusters.csv + touch ${prefix}_clustering_info.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + pandas: \$(python3 -c "import pandas; print(pandas.__version__)") + scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustering/meta.yml b/modules/nf-core/custom/clustering/meta.yml new file mode 100644 index 000000000000..7eb83d610dfe --- /dev/null +++ b/modules/nf-core/custom/clustering/meta.yml @@ -0,0 +1,77 @@ +name: "CUSTOM_CLUSTERING" +description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2 + --pca" +keywords: + - clustering + - pca + - kmeans + - dbscan + - principal-components +tools: + - "scikit-learn": + description: "Machine learning library for clustering" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - eigenvec: + type: file + description: PLINK2 .eigenvec file generated by --pca + pattern: "*.eigenvec" + ontologies: [] + - algorithm: + type: string + description: Clustering algorithm to use (kmeans or dbscan) + - n_clusters: + type: integer + description: Number of clusters for KMeans + - dbscan_eps: + type: float + description: Epsilon parameter for DBSCAN + - dbscan_min_samples: + type: integer + description: Minimum samples parameter for DBSCAN +output: + clusters: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_clusters.csv": + type: file + description: CSV file with sample_id and assigned cluster + pattern: "*_clusters.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + info: + - - meta: + type: map + description: Groovy Map containing sample information + - "*_clustering_info.json": + type: file + description: JSON file with clustering parameters and statistics + pattern: "*_clustering_info.json" + ontologies: + - edam: http://edamontology.org/format_3464 + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/custom/clustering/templates/clustering.py b/modules/nf-core/custom/clustering/templates/clustering.py new file mode 100644 index 000000000000..5175b294a86c --- /dev/null +++ b/modules/nf-core/custom/clustering/templates/clustering.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 + +import json +import platform +import sklearn +import yaml +import re +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans, DBSCAN + +PC_COL_RE = re.compile('[Pp][Cc][0-9]+', re.IGNORECASE) + + +def convert_eigenvec_to_tsv(eigenvec_path, out_pca, 
id_mode='iid'): + rows = [] + n_pcs = 0 + mode = None + + with eigenvec_path.open('r') as fh: + for line in fh: + line = line.strip() + if not line: + continue + parts = line.split() + if parts[0].startswith('#'): + header = [p.lstrip('#') for p in parts] + if len(header) >= 2 and header[0].upper() == 'FID' and header[1].upper() == 'IID': + mode = 'fid_iid' + elif header[0].upper() == 'IID': + mode = 'iid_only' + continue + if mode is None: + try: + float(parts[1]) + mode = 'iid_only' + except (ValueError, IndexError): + mode = 'fid_iid' + if mode == 'fid_iid': + if len(parts) < 3: + continue + fid = parts[0] + iid = parts[1] + pcs = parts[2:] + sample_id = iid if id_mode == 'iid' else f'{fid}:{iid}' + elif mode == 'iid_only': + if len(parts) < 2: + continue + iid = parts[0] + pcs = parts[1:] + sample_id = iid + else: + raise ValueError(f'Unrecognized eigenvec format in {eigenvec_path}') + if n_pcs == 0: + n_pcs = len(pcs) + rows.append((sample_id, pcs)) + + if not rows: + raise ValueError(f'No valid data found in {eigenvec_path}') + + header = ['sample_id'] + [f'PC{i+1}' for i in range(n_pcs)] + with out_pca.open('w') as fh: + fh.write('\\t'.join(header) + '\\n') + for sample_id, pcs in rows: + fh.write(sample_id + '\\t' + '\\t'.join(pcs) + '\\n') + + print(f'[INFO] Converted {len(rows)} samples with {n_pcs} PCs -> {out_pca}') + return n_pcs + + +def read_table_robust(path): + df = pd.read_csv(path, sep='\\t', dtype=str) + print(f'[DEBUG] Initial read: {df.shape[0]} rows x {df.shape[1]} cols', flush=True) + col_names_upper = set(str(c).upper() for c in df.columns) + + def is_header_row(row): + row_values_upper = [str(v).upper() for v in row.values] + overlap = sum(1 for v in row_values_upper if v in col_names_upper) + if overlap >= 3: + return True + header_keywords = {'FID', 'IID', 'PC1', 'PC2', 'PC3'} + if sum(1 for v in row_values_upper if v in header_keywords) >= 2: + return True + return False + + bad_rows = df.apply(is_header_row, axis=1) + if 
bad_rows.any(): + n_bad = int(bad_rows.sum()) + print(f'[INFO] Removed {n_bad} duplicate header row(s)', flush=True) + df = df[~bad_rows].copy().reset_index(drop=True) + + print(f'[INFO] After cleanup: {df.shape[0]} rows x {df.shape[1]} cols', flush=True) + return df + + +def build_sample_id(df): + cols = list(df.columns) + if 'sample_id' in df.columns: + return df['sample_id'].astype(str), df.drop(columns=['sample_id']) + iid_candidates = [c for c in cols if str(c).upper() == 'IID'] + if iid_candidates: + iid = iid_candidates[0] + return df[iid].astype(str), df.drop(columns=[iid]) + fid_candidates = [c for c in cols if str(c).upper() == 'FID'] + if fid_candidates and iid_candidates: + fid = fid_candidates[0] + iid = iid_candidates[0] + sample_ids = df[iid].astype(str) + return sample_ids, df.drop(columns=[c for c in [fid, iid] if c in df.columns]) + pc_cols = [c for c in cols if PC_COL_RE.match(str(c))] + non_pc_cols = [c for c in cols if c not in pc_cols] + if non_pc_cols: + id_col = non_pc_cols[0] + return df[id_col].astype(str), df.drop(columns=[id_col]) + return pd.Series([f'sample_{i}' for i in range(len(df))], index=df.index), df + + +def main(): + prefix = '${meta.id}' + + pca_tsv = Path(f'{prefix}_pca_scores.tsv') + convert_eigenvec_to_tsv(Path('${eigenvec}'), pca_tsv, 'iid') + + df = read_table_robust(str(pca_tsv)) + sample_ids, df_feats = build_sample_id(df) + + pc_cols = [c for c in df_feats.columns if PC_COL_RE.match(str(c))] + if not pc_cols: + raise ValueError('No PC columns found in input') + + X = df_feats[pc_cols].apply(pd.to_numeric, errors='coerce').values + if np.isnan(X).any(): + raise ValueError('NaN values detected in PCA data') + + print(f'[INFO] Loaded {X.shape[0]} samples x {X.shape[1]} principal components', flush=True) + + if '${algorithm}' == 'kmeans': + model = KMeans(n_clusters=${n_clusters}, init='random', n_init=100, random_state=42) + labels = model.fit_predict(X) + info = {'algorithm': 'kmeans', 'k': ${n_clusters}, 'inertia': 
float(model.inertia_)} + else: + model = DBSCAN(eps=${dbscan_eps}, min_samples=${dbscan_min_samples}) + labels = model.fit_predict(X) + n_found = len(set(labels)) - (1 if -1 in labels else 0) + n_noise = int(np.sum(labels == -1)) + info = { + 'algorithm': 'dbscan', + 'eps': ${dbscan_eps}, + 'min_samples': ${dbscan_min_samples}, + 'n_clusters_found': int(n_found), + 'n_noise': n_noise + } + + out_clusters = f'{prefix}_clusters.csv' + out_info = f'{prefix}_clustering_info.json' + + pd.DataFrame({'sample_id': sample_ids.astype(str), 'cluster': labels}).to_csv(out_clusters, index=False) + info.update({ + 'n_samples': int(X.shape[0]), + 'n_features': int(X.shape[1]), + 'feature_names': pc_cols, + 'input_file': Path('${eigenvec}').name + }) + Path(out_info).write_text(json.dumps(info, indent=2)) + + print('[SUCCESS] Clustering completed:') + print(f' -> Clusters : {out_clusters}') + print(f' -> Info : {out_info}') + + + versions = { + 'CUSTOM_CLUSTERING': { + 'python': platform.python_version(), + 'scikit-learn': sklearn.__version__, + 'pandas': pd.__version__, + 'numpy': np.__version__, + } + } + with open('versions.yml', 'w') as fh: + fh.write(yaml.dump(versions, default_flow_style=False)) + +main() diff --git a/modules/nf-core/custom/clustering/tests/data/test.eigenvec b/modules/nf-core/custom/clustering/tests/data/test.eigenvec new file mode 100644 index 000000000000..d0281ae180ce --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/data/test.eigenvec @@ -0,0 +1,6 @@ +#FID IID PC1 PC2 PC3 +0 sample01 0.1234 0.5678 0.9012 +0 sample02 -0.2345 0.6789 -0.0123 +0 sample03 0.3456 -0.7890 0.1234 +0 sample04 -0.4567 0.8901 -0.2345 +0 sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test b/modules/nf-core/custom/clustering/tests/main.nf.test new file mode 100644 index 000000000000..c590a235983c --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test @@ -0,0 +1,51 @@ +nextflow_process { + name "Test Process 
CUSTOM_CLUSTERING" + script "../main.nf" + process "CUSTOM_CLUSTERING" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustering" + + test("clustering - eigenvec") { + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + assert process.success + assert snapshot( + process.out.clusters, + process.out.info, + process.out.versions + ).match() + } + } + + test("clustering - eigenvec - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test.snap b/modules/nf-core/custom/clustering/tests/main.nf.test.snap new file mode 100644 index 000000000000..8a29091d57a2 --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test.snap @@ -0,0 +1,79 @@ +{ + "clustering - eigenvec - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,664d3210ebe520f6f680bb7c41d9b15e" + ], + "clusters": [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "info": [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,664d3210ebe520f6f680bb7c41d9b15e" + ] + } + ], + "timestamp": "2026-05-13T18:21:30.37624233", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + 
} + }, + "clustering - eigenvec": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_clusters.csv:md5,a0ce7a662fecdb42e15e2b2aa0906cf4" + ] + ], + [ + [ + { + "id": "test" + }, + "test_clustering_info.json:md5,c4cb7430071a48a117eae03f66e654ed" + ] + ], + [ + "versions.yml:md5,a5f57bd446ec1ba732607243bebd93fc" + ] + ], + "timestamp": "2026-05-13T22:26:38.454903789", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.09.0" + } + } +} \ No newline at end of file