Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9abde15
New modules: cluster_metrics + cluster_viz
dbaku42 Apr 29, 2026
0320a41
Move custom clustering modules under custom
dbaku42 May 4, 2026
e424338
Move custom clustering modules under custom
dbaku42 May 4, 2026
670bc33
Fix custom clustering module lint
dbaku42 May 4, 2026
3a26119
Fix custom clustering module lint and snapshots
dbaku42 May 4, 2026
e4877e4
Address review comments for clustering custom modules
dbaku42 May 6, 2026
7e19b22
Fix custom clustering module metadata
dbaku42 May 6, 2026
fdb1be4
Add Dockerfile for custom/clustermetrics and custom/clustervisualiation
dbaku42 May 7, 2026
d3a382b
Add container directive for custom/clustermetrics and clustervisualia…
dbaku42 May 7, 2026
89287db
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
d5ac668
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
55e868a
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
e492edb
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
b8529f6
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
12bb00d
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
a4a01a3
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
060eb5d
fix: use template for cluster visualization module
dbaku42 May 8, 2026
d48a224
style: clean cluster visualization module main
dbaku42 May 8, 2026
2d85309
fix: address reviewer feedback for cluster modules
dbaku42 May 8, 2026
387cbeb
fix: address pinin4fjords follow-up review - template escaping, drop …
dbaku42 May 11, 2026
4ed380b
feat(custom/clustervisualization): add UMAP and t-SNE cluster visuali…
dbaku42 May 11, 2026
4fe91e7
fix: apply ruff formatting to cluster_viz.py template
dbaku42 May 11, 2026
aaded95
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 11, 2026
0980208
fix: align clustermetrics and clustervisualization envs and containers
dbaku42 May 11, 2026
f240623
fix: use docker:// prefix for singularity container to enable OCI con…
dbaku42 May 12, 2026
c46c26c
fix(custom/clustervisualization): set NUMBA_CACHE_DIR and MPLCONFIGDI…
dbaku42 May 12, 2026
ad39971
fix(custom/clustervisualization): move NUMBA_CACHE_DIR fix before any…
dbaku42 May 12, 2026
c8fec15
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
3634466
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
0ca17c9
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 12, 2026
6a7ac45
Prek and script fix
dbaku42 May 12, 2026
cddb5a8
Fixed pandas series problem in cluster_metrics.py
dbaku42 May 13, 2026
4a72291
fix: escape \n in f-strings for Groovy template compatibility
dbaku42 May 13, 2026
2bf8afa
Format CUSTOM_CLUSTERMETRICS template with ruff
dbaku42 May 13, 2026
fd15709
fix(clustermetrics,clustervisualization): update nf-test snapshots an…
dbaku42 May 13, 2026
792373b
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 13, 2026
6114ebd
fix environment conflict
dbaku42 May 13, 2026
8348609
feat: add custom clustering and metrics modules
dbaku42 May 13, 2026
b33105d
chore: remove extra custom modules from clustering PR
dbaku42 May 13, 2026
fdf5db0
Merge branch 'master' into add-custom-clustering
dbaku42 May 13, 2026
280aac9
test: update clustering snapshot after pyyaml addition
dbaku42 May 13, 2026
9d98375
Add pyyaml version 6.0.2 to environment.yml
dbaku42 May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,14 @@ test_output/
tests/data/
work/
.github/CODEOWNERS-tmp
modules/local/
vcf_data/
subworkflows/nf-core/snpclustering/modules/
subworkflows/nf-core/snpclustering/run.log
subworkflows/nf-core/snpclustering/run_test.nf
subworkflows/nf-core/snpclustering/test_local.nf
subworkflows/nf-core/snpclustering/scripts/
subworkflows/nf-core/snpclustering/Dockerfile
subworkflows/nf-core/snpclustering/tests/
subworkflows/nf-core/snpclustering/main.nf
modules/nf-core/clustering/
15 changes: 15 additions & 0 deletions modules/nf-core/custom/clustering/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# clustering/environment.yml
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- conda-forge::matplotlib=3.9.4
- conda-forge::numpy=2.4.2
- conda-forge::pandas=2.2.3
- conda-forge::python=3.12.12
- conda-forge::scikit-learn=1.6.1
- conda-forge::seaborn=0.13.2
- conda-forge::umap-learn=0.5.12
- conda-forge::pyyaml=6.0.2
40 changes: 40 additions & 0 deletions modules/nf-core/custom/clustering/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Cluster samples on principal components produced by PLINK2 --pca,
// using either KMeans or DBSCAN (selected via the `algorithm` input).
// The actual work happens in the Groovy-templated script templates/clustering.py.
process CUSTOM_CLUSTERING {
    tag "$meta.id"
    label 'process_medium'
    conda "${moduleDir}/environment.yml"
    // Same Wave image for every engine; the docker:// prefix lets
    // Singularity/Apptainer pull the OCI image directly.
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'docker://community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' :
        'community.wave.seqera.io/library/matplotlib_pandas_python_scikit-learn_pruned:054f91aaa56bd7d5' }"

    input:
    tuple val(meta), path(eigenvec)   // sample meta map + PLINK2 .eigenvec file
    val algorithm                     // 'kmeans' or 'dbscan' (anything else falls through to DBSCAN in the template)
    val n_clusters                    // k for KMeans
    val dbscan_eps                    // eps for DBSCAN
    val dbscan_min_samples            // min_samples for DBSCAN

    output:
    tuple val(meta), path("*_clusters.csv")         , emit: clusters
    tuple val(meta), path("*_clustering_info.json") , emit: info, optional: true
    path "versions.yml"                             , emit: versions, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // All inputs are interpolated into the Python template by the Groovy engine.
    template 'clustering.py'

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE(review): stub versions.yml omits numpy and uses ${task.process},
    // while the script template records numpy under a hardcoded
    // 'CUSTOM_CLUSTERING' key — confirm the nf-test snapshots expect this divergence.
    """
    touch ${prefix}_clusters.csv
    touch ${prefix}_clustering_info.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | sed 's/Python //')
        pandas: \$(python3 -c "import pandas; print(pandas.__version__)")
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
77 changes: 77 additions & 0 deletions modules/nf-core/custom/clustering/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: "CUSTOM_CLUSTERING"
description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2
--pca"
keywords:
- clustering
- pca
- kmeans
- dbscan
- principal-components
tools:
- "scikit-learn":
description: "Machine learning library for clustering"
homepage: "https://scikit-learn.org/"
documentation: "https://scikit-learn.org/stable/modules/clustering.html"
licence:
- "BSD-3-Clause"
identifier: ""
input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- eigenvec:
type: file
description: PLINK2 .eigenvec file generated by --pca
pattern: "*.eigenvec"
ontologies: []
- algorithm:
type: string
description: Clustering algorithm to use (kmeans or dbscan)
- n_clusters:
type: integer
description: Number of clusters for KMeans
- dbscan_eps:
type: float
description: Epsilon parameter for DBSCAN
- dbscan_min_samples:
type: integer
description: Minimum samples parameter for DBSCAN
output:
clusters:
- - meta:
type: map
description: Groovy Map containing sample information
- "*_clusters.csv":
type: file
description: CSV file with sample_id and assigned cluster
pattern: "*_clusters.csv"
ontologies:
- edam: http://edamontology.org/format_3752
info:
- - meta:
type: map
description: Groovy Map containing sample information
- "*_clustering_info.json":
type: file
description: JSON file with clustering parameters and statistics
pattern: "*_clustering_info.json"
ontologies:
- edam: http://edamontology.org/format_3464
versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"
ontologies:
- edam: http://edamontology.org/format_3750
topics:
versions:
- versions.yml:
type: string
description: The name of the process
authors:
- "@dbaku42"
maintainers:
- "@dbaku42"
184 changes: 184 additions & 0 deletions modules/nf-core/custom/clustering/templates/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env python3

import json
import platform
import sklearn
import yaml
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN

PC_COL_RE = re.compile('[Pp][Cc][0-9]+', re.IGNORECASE)


def convert_eigenvec_to_tsv(eigenvec_path, out_pca, id_mode='iid'):
    """Convert a PLINK2 .eigenvec file to a tab-separated PCA score table.

    Supports both header styles emitted by PLINK2: '#FID IID PC1 ...' and
    '#IID PC1 ...'. For headerless files the layout is guessed from the
    first data row: if the second column parses as a float the file is
    treated as IID-only, otherwise as FID+IID.

    Note: the '\\t' / '\\n' sequences below are rendered to literal tab and
    newline escapes by the Nextflow/Groovy template engine.

    Args:
        eigenvec_path: pathlib.Path to the input .eigenvec file.
        out_pca: pathlib.Path of the TSV to write (columns: sample_id, PC1..PCn).
        id_mode: 'iid' keeps the IID alone; any other value joins 'FID:IID'
            (only relevant in FID+IID mode).

    Returns:
        The number of PC columns detected (taken from the first data row).

    Raises:
        ValueError: if the layout cannot be determined or no data rows exist.
    """
    rows = []
    n_pcs = 0
    mode = None

    with eigenvec_path.open('r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if parts[0].startswith('#'):
                # Header line: detect the column layout from the leading names.
                header = [p.lstrip('#') for p in parts]
                if len(header) >= 2 and header[0].upper() == 'FID' and header[1].upper() == 'IID':
                    mode = 'fid_iid'
                elif header[0].upper() == 'IID':
                    mode = 'iid_only'
                continue
            if mode is None:
                # Headerless file: if column 2 is numeric it must be PC1,
                # so the first column is the IID; otherwise assume FID+IID.
                # NOTE(review): a headerless FID+IID file with purely numeric
                # IIDs would be misclassified here — confirm upstream inputs.
                try:
                    float(parts[1])
                    mode = 'iid_only'
                except (ValueError, IndexError):
                    mode = 'fid_iid'
            if mode == 'fid_iid':
                if len(parts) < 3:
                    continue  # not enough columns for FID, IID and >=1 PC
                fid = parts[0]
                iid = parts[1]
                pcs = parts[2:]
                sample_id = iid if id_mode == 'iid' else f'{fid}:{iid}'
            elif mode == 'iid_only':
                if len(parts) < 2:
                    continue  # not enough columns for IID and >=1 PC
                iid = parts[0]
                pcs = parts[1:]
                sample_id = iid
            else:
                raise ValueError(f'Unrecognized eigenvec format in {eigenvec_path}')
            if n_pcs == 0:
                # PC count is fixed by the first data row; later rows are not re-checked.
                n_pcs = len(pcs)
            rows.append((sample_id, pcs))

    if not rows:
        raise ValueError(f'No valid data found in {eigenvec_path}')

    header = ['sample_id'] + [f'PC{i+1}' for i in range(n_pcs)]
    with out_pca.open('w') as fh:
        fh.write('\\t'.join(header) + '\\n')
        for sample_id, pcs in rows:
            fh.write(sample_id + '\\t' + '\\t'.join(pcs) + '\\n')

    print(f'[INFO] Converted {len(rows)} samples with {n_pcs} PCs -> {out_pca}')
    return n_pcs


def read_table_robust(path):
    """Read a tab-separated table as strings and drop duplicate header rows.

    Concatenated PLINK outputs can repeat the header line inside the data;
    any row that looks like a header (mostly matches the column names, or
    contains several known header keywords) is removed.

    Note: '\\t' below is rendered to a literal tab by the Nextflow/Groovy
    template engine.

    Args:
        path: path (string) of the TSV file to read.

    Returns:
        pandas.DataFrame with all values as strings and a reset index.
    """
    df = pd.read_csv(path, sep='\\t', dtype=str)
    print(f'[DEBUG] Initial read: {df.shape[0]} rows x {df.shape[1]} cols', flush=True)
    col_names_upper = set(str(c).upper() for c in df.columns)

    def is_header_row(row):
        # A row is a stray header if >=3 of its values equal column names,
        # or >=2 of them are well-known eigenvec header keywords.
        row_values_upper = [str(v).upper() for v in row.values]
        overlap = sum(1 for v in row_values_upper if v in col_names_upper)
        if overlap >= 3:
            return True
        header_keywords = {'FID', 'IID', 'PC1', 'PC2', 'PC3'}
        if sum(1 for v in row_values_upper if v in header_keywords) >= 2:
            return True
        return False

    bad_rows = df.apply(is_header_row, axis=1)
    if bad_rows.any():
        n_bad = int(bad_rows.sum())
        print(f'[INFO] Removed {n_bad} duplicate header row(s)', flush=True)
        df = df[~bad_rows].copy().reset_index(drop=True)

    print(f'[INFO] After cleanup: {df.shape[0]} rows x {df.shape[1]} cols', flush=True)
    return df


def build_sample_id(df):
    """Extract a sample-identifier Series from *df* and return it with the
    remaining feature columns.

    Lookup priority: an explicit 'sample_id' column, then a case-insensitive
    'IID' column, then the first non-PC column, and finally synthetic ids
    ('sample_0', 'sample_1', ...) when every column looks like a PC score.

    Args:
        df: pandas.DataFrame holding the PCA table (string dtype).

    Returns:
        Tuple of (sample_ids: pandas.Series of str,
                  features: pandas.DataFrame without the id column).
    """
    cols = list(df.columns)
    if 'sample_id' in df.columns:
        return df['sample_id'].astype(str), df.drop(columns=['sample_id'])
    iid_candidates = [c for c in cols if str(c).upper() == 'IID']
    if iid_candidates:
        iid = iid_candidates[0]
        return df[iid].astype(str), df.drop(columns=[iid])
    # Fix: the original had an unreachable `if fid_candidates and iid_candidates`
    # branch here — `iid_candidates` is always empty at this point because a
    # non-empty list already returned above. A lone FID column (no IID) is
    # handled below as the first non-PC column.
    pc_cols = [c for c in cols if PC_COL_RE.match(str(c))]
    non_pc_cols = [c for c in cols if c not in pc_cols]
    if non_pc_cols:
        id_col = non_pc_cols[0]
        return df[id_col].astype(str), df.drop(columns=[id_col])
    # Every column is a PC score: fall back to positional synthetic ids.
    return pd.Series([f'sample_{i}' for i in range(len(df))], index=df.index), df


def main():
    """Template entry point: convert the eigenvec input, cluster the PC
    matrix with KMeans or DBSCAN, and write the cluster CSV plus an info JSON.

    All '${...}' tokens below are substituted by the Nextflow/Groovy template
    engine before this script runs (meta.id, eigenvec, algorithm, n_clusters,
    dbscan_eps, dbscan_min_samples).
    """
    prefix = '${meta.id}'

    # Normalize the PLINK2 .eigenvec into a sample_id/PC1..PCn TSV.
    pca_tsv = Path(f'{prefix}_pca_scores.tsv')
    convert_eigenvec_to_tsv(Path('${eigenvec}'), pca_tsv, 'iid')

    df = read_table_robust(str(pca_tsv))
    sample_ids, df_feats = build_sample_id(df)

    pc_cols = [c for c in df_feats.columns if PC_COL_RE.match(str(c))]
    if not pc_cols:
        raise ValueError('No PC columns found in input')

    # Coerce to floats; any unparsable value becomes NaN and aborts the run.
    X = df_feats[pc_cols].apply(pd.to_numeric, errors='coerce').values
    if np.isnan(X).any():
        raise ValueError('NaN values detected in PCA data')

    print(f'[INFO] Loaded {X.shape[0]} samples x {X.shape[1]} principal components', flush=True)

    if '${algorithm}' == 'kmeans':
        # random_state pinned for reproducible cluster assignments.
        model = KMeans(n_clusters=${n_clusters}, init='random', n_init=100, random_state=42)
        labels = model.fit_predict(X)
        info = {'algorithm': 'kmeans', 'k': ${n_clusters}, 'inertia': float(model.inertia_)}
    else:
        # Any non-'kmeans' value falls through to DBSCAN; label -1 marks noise.
        model = DBSCAN(eps=${dbscan_eps}, min_samples=${dbscan_min_samples})
        labels = model.fit_predict(X)
        n_found = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = int(np.sum(labels == -1))
        info = {
            'algorithm': 'dbscan',
            'eps': ${dbscan_eps},
            'min_samples': ${dbscan_min_samples},
            'n_clusters_found': int(n_found),
            'n_noise': n_noise
        }

    out_clusters = f'{prefix}_clusters.csv'
    out_info = f'{prefix}_clustering_info.json'

    pd.DataFrame({'sample_id': sample_ids.astype(str), 'cluster': labels}).to_csv(out_clusters, index=False)
    info.update({
        'n_samples': int(X.shape[0]),
        'n_features': int(X.shape[1]),
        'feature_names': pc_cols,
        'input_file': Path('${eigenvec}').name
    })
    Path(out_info).write_text(json.dumps(info, indent=2))

    print('[SUCCESS] Clustering completed:')
    print(f' -> Clusters : {out_clusters}')
    print(f' -> Info : {out_info}')


# Record tool versions for nf-core provenance tracking.
# NOTE(review): the key is hardcoded as 'CUSTOM_CLUSTERING' rather than the
# interpolated '${task.process}' used by the stub, and numpy is listed here
# but not in the stub's versions.yml — confirm the snapshots accept both forms.
versions = {
    'CUSTOM_CLUSTERING': {
        'python': platform.python_version(),
        'scikit-learn': sklearn.__version__,
        'pandas': pd.__version__,
        'numpy': np.__version__,
    }
}
with open('versions.yml', 'w') as fh:
    fh.write(yaml.dump(versions, default_flow_style=False))

main()
6 changes: 6 additions & 0 deletions modules/nf-core/custom/clustering/tests/data/test.eigenvec
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#FID IID PC1 PC2 PC3
0 sample01 0.1234 0.5678 0.9012
0 sample02 -0.2345 0.6789 -0.0123
0 sample03 0.3456 -0.7890 0.1234
0 sample04 -0.4567 0.8901 -0.2345
0 sample05 0.5678 -0.9012 0.3456
51 changes: 51 additions & 0 deletions modules/nf-core/custom/clustering/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// nf-test suite for CUSTOM_CLUSTERING: runs KMeans (k=3) on a tiny bundled
// .eigenvec file, once for real and once in stub mode, snapshotting outputs.
nextflow_process {
    name "Test Process CUSTOM_CLUSTERING"
    script "../main.nf"
    process "CUSTOM_CLUSTERING"

    tag "modules"
    tag "modules_nfcore"
    tag "custom"
    tag "custom/clustering"

    test("clustering - eigenvec") {
        when {
            process {
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            assert process.success
            // Snapshot the clusters CSV, the info JSON and the versions file.
            assert snapshot(
                process.out.clusters,
                process.out.info,
                process.out.versions
            ).match()
        }
    }

    test("clustering - eigenvec - stub") {
        options "-stub"
        when {
            process {
                // Same inputs as above; stub mode only touches empty outputs.
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            assert process.success
            assert snapshot(process.out).match()
        }
    }
}
Loading
Loading