nebiolabs · bwlang · Oct 13, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ nextflow run fastq_to_ubam.nf \
 ## Quick Start
 1. Install [miniforge](https://conda-forge.org/download/) and [bioconda](https://bioconda.github.io/) (see [Requirements](<README#Requirements>))
 2. Install [Nextflow](https://www.nextflow.io/) (e.g. conda install nextflow, or see [Nextflow installation guide](https://www.nextflow.io/docs/latest/getstarted.html#installation))
-3. Clone this repository (`git clone https://github.com/nebiolabs/EM-seq.git`). Modify `nextflow.config` as needed for your environment, e.g. if running locally, change executor block to 'local' and set, e.g. `--max_cpus 10 --max_memory 30.GB`. 
+3. Clone this repository (`git clone https://github.com/nebiolabs/EM-seq.git`). Modify `nextflow.config` as needed for your environment, e.g. if running locally, change executor block to 'local' and set, e.g. `--max_cpus 10 --max_memory 30.GB`.
 4. Download or prepare a genome reference FASTA file (see [Reference Genomes](<README#Reference Genomes>))
 5. Create a bwameth index for the fasta and add it to your references in conf/references.config
 6. Run the pipeline with appropriate parameters (see [Basic Usage](<README#Basic Usage>))
@@ -59,10 +59,10 @@ nextflow run main.nf \
 ### References Config
 
 Modify the conf/references.config file to specify your genome files
-- `genome_fa` path to your genome fasta file 
-- `genome_fai` path to your genome fasta fai file 
-- `bwameth_index` path to your genome fasta file where bwameth indices exist 
-- `target_bed` BED file for targeted analysis, Optional 
+- `genome_fa` path to your genome fasta file
+- `genome_fai` path to your genome fasta fai file
+- `bwameth_index` path to your genome fasta file where bwameth indices exist
+- `target_bed` BED file for targeted analysis, Optional
 
 ### Advanced Options
 - `--tmp_dir` - Temporary directory (default: `/tmp`)
@@ -77,7 +77,8 @@ Pre-built reference genomes with methylation spike-in controls:
 - Create your own reference by appending the [control sequences](methylation_controls.fa) to your preferred genome fasta (e.g. `cat genome.fa methylation_controls.fa > genome+methylation_controls.fa`)
 
 ## Requirements
-- [Nextflow](https://www.nextflow.io/)
+- [Nextflow](https://www.nextflow.io/) (version 24.04 or later)
+ https://get.nextflow.io | bash`
 - [Miniforge](https://conda-forge.org/download/), [Micromamba](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html), or [Conda](https://docs.conda.io/projects/conda/en/stable/) for dependency management
 - [Bioconda](https://bioconda.github.io/) channel configured
 - Sufficient computational resources (memory scales with input size)
@@ -105,7 +106,7 @@ You may also be interested in the [nf-core methylseq project](https://nf-co.re/m
  - development workflow will run from master branch
 
  ### Testing:
- - Tests are run using nf-test and are integrated into github actions
+ - Tests are run using [nf-test](https://www.nf-test.com/) and are integrated into github actions
  - install nf-test from bioconda using conda/mamba
  - To run all tests:
  ```bash
@@ -115,4 +116,3 @@ nf-test test
 ```bash
 nf-test test --updateSnapshot
 ```
-
diff --git a/conf/references.config b/conf/references.config
@@ -4,23 +4,58 @@ params {
             bwa_index: "/mnt/galaxy/galaxyworks/tool-data/T2T_chm13v2.0+bs_controls/bwameth_index/T2T_chm13v2.0+bs_controls/T2T_chm13v2.0+bs_controls.fa",
             genome_fa: "/mnt/galaxy/galaxyworks/tool-data/T2T_chm13v2.0+bs_controls/bwameth_index/T2T_chm13v2.0+bs_controls/T2T_chm13v2.0+bs_controls.fa",
             genome_fai: "/mnt/galaxy/galaxyworks/tool-data/T2T_chm13v2.0+bs_controls/bwameth_index/T2T_chm13v2.0+bs_controls/T2T_chm13v2.0+bs_controls.fa.fai",
-            target_bed: "/bioinfo/ref/t2t_chm13_v2+meth_controls+m13+phix/folded_windows/T2T_chm13v2.0+meth_controls_hairpins.s18w200nogt.bed"
+            target_bed: "/bioinfo/ref/t2t_chm13_v2+meth_controls+m13+phix/folded_windows/T2T_chm13v2.0+meth_controls_hairpins.s18w200nogt.bed",
+            ucsc_cpg_islands_url: "https://hgdownload.soe.ucsc.edu/gbdb/hs1/bbi/cpgIslandExt.bb",  // bigBed format
+            refseq_gtf_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz",
+            ncbi_assembly_report_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt",
+            dfam_url: "https://www.dfam.org/releases/annotations_exports/hs1_df39/hs1_df39-byacc-full_region.tsv",
+            epd_promoters_url: "https://epd.expasy.org/ftp/epdnew/H_sapiens/current/Hs_EPDnew.bed",
+            epd_liftover_url: "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain" //GRCh38 version, no hs1 available
+        ],
+        'grch38_core+bs_controls': [
+            bwa_index: "/mnt/galaxy/data/genome/grch38_core+bs_controls/bwameth_index/grch38_core+bs_controls/grch38_core+bs_controls.fa",
+            genome_fa: "/mnt/galaxy/data/genome/grch38_core+bs_controls/sam_indexes/grch38_core+bs_controls/grch38_core+bs_controls.fa",
+            genome_fai: "/mnt/galaxy/data/genome/grch38_core+bs_controls/sam_indexes/grch38_core+bs_controls/grch38_core+bs_controls.fa.fai",
+            ucsc_cpg_islands_url: "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cpgIslandExt.txt.gz",
+            refseq_gtf_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz",
+            ncbi_assembly_report_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt",
+            dfam_url: "https://www.dfam.org/releases/annotations_exports/hg38_df39/hg38_df39-byacc-full_region.tsv",
+            epd_promoters_url: "https://epd.expasy.org/ftp/epdnew/H_sapiens/current/Hs_EPDnew.bed",
+            epd_liftover_url: null
         ],
         'grcm39+meth_controls': [
             bwa_index: "/mnt/galaxy/data/genome/grcm39+meth_controls/bwameth_index/grcm39+meth_controls/grcm39+meth_controls.fa",
             genome_fa: "/mnt/galaxy/data/genome/grcm39+meth_controls/bwameth_index/grcm39+meth_controls/grcm39+meth_controls.fa",
             genome_fai: "/mnt/galaxy/data/genome/grcm39+meth_controls/bwameth_index/grcm39+meth_controls/grcm39+meth_controls.fa.fai",
+            ucsc_cpg_islands_url: "https://hgdownload.soe.ucsc.edu/goldenPath/mm39/database/cpgIslandExt.txt.gz",
+            refseq_gtf_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Mus_musculus/latest_assembly_versions/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.gff.gz",
+            ncbi_assembly_report_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Mus_musculus/latest_assembly_versions/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_assembly_report.txt",
+            dfam_url: null,
+            epd_promoters_url: "ftp://ccg.epfl.ch/epdnew/M_musculus/006/Mm_EPDnew_006_mm39.bed",
+            epd_liftover_url: null
         ],
         't2t_2.0_eschColi_K12_bs_controls': [
             bwa_index: "/mnt/galaxy/galaxyworks/tool-data/t2t_2.0_eschColi_K12_bs_controls/bwameth_index/t2t_2.0_eschColi_K12_bs_controls/t2t_2.0_eschColi_K12_bs_controls.fa",
             genome_fa: "/mnt/galaxy/galaxyworks/tool-data/t2t_2.0_eschColi_K12_bs_controls/bwameth_index/t2t_2.0_eschColi_K12_bs_controls/t2t_2.0_eschColi_K12_bs_controls.fa",
             genome_fai: "/mnt/galaxy/galaxyworks/tool-data/t2t_2.0_eschColi_K12_bs_controls/bwameth_index/t2t_2.0_eschColi_K12_bs_controls/t2t_2.0_eschColi_K12_bs_controls.fa.fai",
+            ucsc_cpg_islands_url: "https://hgdownload.soe.ucsc.edu/gbdb/hs1/bbi/cpgIslandExt.bb",
+            refseq_gtf_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz",
+            ncbi_assembly_report_url: "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt",
+            dfam_url: "https://www.dfam.org/releases/annotations_exports/hs1_df39/hs1_df39-byacc-full_region.tsv",
+            epd_promoters_url: "https://epd.expasy.org/ftp/epdnew/H_sapiens/current/Hs_EPDnew.bed",
+            epd_liftover_url: "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain", //GRCh38 version, no hs1 available
         ],
         'test': [
             bwa_index: "${projectDir}/tests/fixtures/reference_files/reference.fa",
             genome_fa: "${projectDir}/tests/fixtures/reference_files/reference.fa",
             genome_fai: "${projectDir}/tests/fixtures/reference_files/reference.fa.fai",
-            target_bed: "${projectDir}/tests/fixtures/target_bed/emseq_test_regions.bed"
+            target_bed: "${projectDir}/tests/fixtures/target_bed/emseq_test_regions.bed",
+            ucsc_cpg_islands_url: "file://${projectDir}/tests/fixtures/features/test_cpgIslandExt.bb",
+            refseq_gtf_url: "file://${projectDir}/tests/fixtures/features/test_refseq.gff.gz",
+            ncbi_assembly_report_url: "file://${projectDir}/tests/fixtures/features/test_assembly_report.txt",
+            dfam_url: "file://${projectDir}/tests/fixtures/features/test_dfam.out",
+            epd_promoters_url: "file://${projectDir}/tests/fixtures/features/epd_promoters.bed",
+            epd_liftover_url: "file://${projectDir}/tests/fixtures/features/grch38-chm13v2.chain"
         ]
     ]
-}
+}
diff --git a/conf/test.config b/conf/test.config
@@ -24,8 +24,6 @@ params {
     max_time   = '6.h'
 
     outputDir = "test_output"
+}
 
-    }
-
-    trace.overwrite = true
-
+trace.overwrite = true
diff --git a/feature_cov.nf b/feature_cov.nf
@@ -0,0 +1,192 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// Require Nextflow version 24.04 or later for topic channels support
+// Topic channels are used for version tracking in subprocesses
+if( !nextflow.version.matches('>=24.04') ) {
+    error "This workflow requires Nextflow version 24.04 or later. Please upgrade your Nextflow installation:\n" +
+          "  Current version: ${nextflow.version}\n" +
+          "  Required version: 24.04+\n" +
+          "  Upgrade with: nextflow self-update\n" +
+          "  Or install latest: curl -s https://get.nextflow.io | bash"
+}
+
+// Enable topic preview flag for versions < 25.04 (graduated from preview in 25.04)
+if( nextflow.version.matches('>=24.04') && nextflow.version.matches('<25.04') ) {
+    nextflow.preview.topic = true
+}
+
+
+include { download_epd_promoters;
+          download_liftover_chain;
+          crossmap_epd_promoters;
+          download_cpg_islands;
+          download_refseq_gtf;
+          download_assembly_report;
+          normalize_cpg_islands;
+          normalize_refseq_features;
+          download_dfam_annotations }                     from './modules/prepare_features'
+include { calculate_feature_methylation;
+          combine_methylation }                           from './modules/feature_methylation'
+include { calculate_feature_counts;
+          combine_counts }                                from './modules/feature_counts'
+
+params.genome = null
+params.methylation_bed = null
+params.bam_files_glob = '*.bam'  // Indexes auto-detected: .bai, .csi, .crai
+params.tmp_dir = '/tmp'
+params.outputDir = 'output'
+
+params.ucsc_cpg_islands_url = null
+params.refseq_gtf_url = null
+params.ncbi_assembly_report_url = null
+params.dfam_url = null
+params.epd_promoters_url = null
+
+if (!params.genome) {
+    exit 1, "Error: --genome parameter is required"
+}
+if (!params.bam_files_glob) {
+    exit 1, "Error: --bam_files_glob parameter is required"
+}
+
+if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
+    exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
+}
+
+genome = params.genome
+params.reference_list = params.genomes ? params.genomes[genome] : [:]
+genome_fa = params.reference_list.genome_fa ?: params.genome_fa
+genome_fai = params.reference_list.genome_fai ?: params.genome_fai
+
+ucsc_cpg_islands_url = params.ucsc_cpg_islands_url ?: params.reference_list.ucsc_cpg_islands_url
+refseq_gtf_url = params.refseq_gtf_url ?: params.reference_list.refseq_gtf_url
+ncbi_assembly_report_url = params.ncbi_assembly_report_url ?: params.reference_list.ncbi_assembly_report_url
+dfam_url = params.dfam_url ?: params.reference_list.dfam_url
+epd_promoters_url = params.epd_promoters_url ?: params.reference_list.epd_promoters_url
+epd_liftover_url = params.reference_list.epd_liftover_url ?: null
+
+refseq_feature_types = Channel.of(
+    'promoter',
+    'transcriptional_cis_regulatory_region',
+    'enhancer',
+    'silencer',
+    'mobile_genetic_element',
+    'lnc_RNA',
+    'snRNA',
+    'snoRNA',
+    'primary_transcript',
+    'tRNA',
+    'exon',
+    'mRNAexon1',
+    'mRNAexon'
+)
+
+workflow {
+    main:
+        bam_with_index = Channel.fromPath(params.bam_files_glob, checkIfExists: true)
+            .map { alignment ->
+                def base = alignment.toString().replaceAll(/\.(bam|cram)$/, '')
+                def idx = ["${alignment}.bai", "${base}.bai",
+                           "${alignment}.csi", "${base}.csi",
+                           "${alignment}.crai", "${base}.crai"]
+                    .collect { file(it) }
+                    .find { it.exists() }
+                if (!idx) error "No index found for: ${alignment}"
+                tuple(alignment, idx)
+            }
+            .collect()
+
+        if (epd_promoters_url) {
+            download_epd_promoters(epd_promoters_url)
+
+            if (epd_liftover_url) {
+                download_liftover_chain(epd_liftover_url)
+                crossmap_epd_promoters(
+                    download_epd_promoters.out.gtf,
+                    download_liftover_chain.out.chain_file
+                )
+
+                epd_gtf = crossmap_epd_promoters.out.gtf
+                    .map { gtf -> tuple('epd_promoter', gtf) }
+            } else {
+                epd_gtf = download_epd_promoters.out.gtf
+                    .map { gtf -> tuple('epd_promoter', gtf) }
+            }
+        } else {
+            epd_gtf = Channel.empty()
+        }
+
+        if (ncbi_assembly_report_url) {
+            download_assembly_report(ncbi_assembly_report_url)
+            assembly_report = download_assembly_report.out.report
+        } else {
+            assembly_report = Channel.empty()
+        }
+
+        if (ucsc_cpg_islands_url && ncbi_assembly_report_url) {
+            download_cpg_islands(ucsc_cpg_islands_url)
+            normalize_cpg_islands(download_cpg_islands.out.ucsc_file, assembly_report)
+            cpg_gtf = normalize_cpg_islands.out.gtf
+                .map { gtf -> tuple('cpg_island', gtf) }
+        } else {
+            cpg_gtf = Channel.empty()
+        }
+
+        if (refseq_gtf_url && ncbi_assembly_report_url) {
+            download_refseq_gtf(refseq_gtf_url)
+            refseq_combined = download_refseq_gtf.out.gtf
+                .combine(assembly_report)
+                .combine(Channel.value(file(genome_fai, checkIfExists: true)))
+                .combine(refseq_feature_types)
+
+            normalize_refseq_features(refseq_combined)
+            refseq_gtfs = normalize_refseq_features.out.gtf
+        } else {
+            refseq_gtfs = Channel.empty()
+        }
+
+        if (dfam_url) {
+            download_dfam_annotations(dfam_url)
+            dfam_gtf = download_dfam_annotations.out.gtf
+                .map { gtf -> tuple('dfam_repeat', gtf) }
+        } else {
+            dfam_gtf = Channel.empty()
+        }
+
+        if (params.methylation_bed) {
+            methylation_bed = Channel.fromPath(params.methylation_bed, checkIfExists: true)
+
+            all_gtfs = epd_gtf
+                .mix(cpg_gtf)
+                .mix(refseq_gtfs)
+                .mix(dfam_gtf)
+
+            calculate_feature_methylation(all_gtfs, methylation_bed)
+
+            combine_methylation(
+                calculate_feature_methylation.out.methylation
+                    .map { feature_type, tsv -> tsv }
+                    .collect()
+            )
+        }
+
+        all_features_for_counts = epd_gtf
+            .map { type, file -> tuple(type, file, 'gene_id') }
+            .mix(cpg_gtf.map { type, file -> tuple(type, file, 'transcript_id') })
+            .mix(refseq_gtfs.map { type, file -> tuple(type, file, 'gene_id') })
+            .mix(dfam_gtf.map { type, file -> tuple(type, file, 'gene_id') })
+
+        calculate_feature_counts(
+            all_features_for_counts.map { type, file, attr -> tuple(type, file) },
+            bam_with_index,
+            all_features_for_counts.map { type, file, attr -> attr }
+        )
+
+        combine_counts(
+            calculate_feature_counts.out.counts
+                .map { feature_type, tsv -> tsv }
+                .collect()
+        )
+}
diff --git a/main.nf b/main.nf
@@ -1,4 +1,16 @@
-nextflow.preview.topic = true
+// Require Nextflow version 24.04 or later for topic channels support
+if( !nextflow.version.matches('>=24.04') ) {
+    error "This workflow requires Nextflow version 24.04 or later. Please upgrade your Nextflow installation:\n" +
+          "  Current version: ${nextflow.version}\n" +
+          "  Required version: 24.04+\n" +
+          "  Upgrade with: nextflow self-update\n" +
+          "  Or install latest: curl -s https://get.nextflow.io | bash"
+}
+
+// Enable topic preview flag for versions < 25.04 (graduated from preview in 25.04)
+if( nextflow.version.matches('>=24.04') && nextflow.version.matches('<25.04') ) {
+    nextflow.preview.topic = true
+}
 
 include { createVersionsFile }                                from './lib/versions.nf'
 include { format_ngs_agg_opts }                               from './modules/aggregate_results'

diff --git a/modules/aggregate_results.nf b/modules/aggregate_results.nf
@@ -2,10 +2,10 @@ def format_ngs_agg_opts(optlist) {
     '''
     optlist: list of lists of format:
         [
-            [ opt1_name, Channel( val(library), results1 ) ], 
-            [ opt2_name, Channel( val(library), results2 ) ], 
+            [ opt1_name, Channel( val(library), results1 ) ],
+            [ opt2_name, Channel( val(library), results2 ) ],
         ]
-    
+
     Returns a channel of format: [lib, [opt1, opt2], [results1, results2]]
     '''
 
@@ -26,10 +26,10 @@ def format_ngs_agg_opts(optlist) {
 process aggregate_results {
     tag { library }
     conda "bioconda::samtools=1.9 git=2.40.1"
-    publishDir "${params.outputDir}/aggregate_results"
+    publishDir "${params.outputDir}/aggregate_results", mode: 'copy'
     label 'process_single'
 
-    input:         
+    input:
 	tuple val(library), val(ngs_agg_opts), path(ngs_agg_paths)
     val(workflow_name)
 
@@ -53,7 +53,7 @@ process aggregate_results {
         --commit_hash \$GIT_HASH \\
         --contact_email "${params.email}" \\
         ${opt_val} \\
-    
+
     """
     stub:
     def opt_val = [ngs_agg_opts, ngs_agg_paths].transpose().collect{ opt, fp -> "${opt} ${fp}" }.join(' ')

diff --git a/modules/combine_nonconverted_counts.nf b/modules/combine_nonconverted_counts.nf
@@ -1,6 +1,6 @@
 process combine_nonconverted_counts {
     label 'process_single'
-    publishDir "${params.outputDir}/stats/nonconverted_counts"
+    publishDir "${params.outputDir}/stats/nonconverted_counts", mode: 'copy'
 
     input:
         tuple val(library), path(nonconverted_counts)
@@ -23,6 +23,6 @@ process combine_nonconverted_counts {
             }
         }
     ' ${nonconverted_counts} > ${library}.nonconverted_counts.for_agg.tsv
-    
+
     """
 }