diff --git a/modules/nf-core/bigslice/main.nf b/modules/nf-core/bigslice/main.nf index 906e98597b18..779095d6eaf7 100644 --- a/modules/nf-core/bigslice/main.nf +++ b/modules/nf-core/bigslice/main.nf @@ -8,12 +8,13 @@ process BIGSLICE { : 'biocontainers/bigslice:2.0.2--pyh8ed023e_0'}" input: - tuple val(meta), path(bgc, stageAs: 'bgc_files/*') - path hmmdb + tuple val(meta), path(bgc, stageAs: 'bgc_files/s*/*') + path(hmmdb) + val(export_tsv) output: - tuple val(meta), path("${prefix}/result/data.db"), emit: db - tuple val(meta), path("${prefix}/result/tmp/**/*.fa"), emit: fa + tuple val(meta), path("${prefix}/result") , emit: output + tuple val(meta), path("${prefix}/result/tsv_export") , emit: tsv, optional: true // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. tuple val("${task.process}"), val('bigslice'), val("2.0.2"), topic: versions, emit: versions_bigslice @@ -21,12 +22,14 @@ process BIGSLICE { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: "${meta.id}" def sample = meta.id + def export_tsv_cmd = export_tsv ? "bigslice --export-tsv ${prefix}/result/tsv_export --program_db_folder ${hmmdb} ${args2} ${prefix}" : '' """ mkdir -p input/dataset/${sample} input/taxonomy - cp bgc_files/* input/dataset/${sample}/ + find bgc_files -name '*.gbk' | xargs -I{} cp {} input/dataset/${sample}/ printf "# dataset_name\\tdataset_path\\ttaxonomy_path\\tdescription\\n" > input/datasets.tsv printf "dataset\\tdataset\\ttaxonomy/taxonomy.tsv\\tBGC dataset\\n" >> input/datasets.tsv @@ -39,16 +42,22 @@ process BIGSLICE { -i input \\ --program_db_folder ${hmmdb} \\ ${prefix} + + ${export_tsv_cmd} """ stub: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" """ echo ${args} mkdir -p ${prefix}/result/tmp/2e555308dfc411186cf012334262f127 touch ${prefix}/result/data.db touch ${prefix}/result/tmp/2e555308dfc411186cf012334262f127/test.fa + if ${export_tsv}; then + mkdir -p ${prefix}/result/tsv_export + touch ${prefix}/result/tsv_export/bgcs.tsv + fi """ } diff --git a/modules/nf-core/bigslice/meta.yml b/modules/nf-core/bigslice/meta.yml index 611bd8127115..54caea097ad2 100644 --- a/modules/nf-core/bigslice/meta.yml +++ b/modules/nf-core/bigslice/meta.yml @@ -8,15 +8,15 @@ keywords: - analysis tools: - "bigslice": - description: A highly scalable, user-interactive tool for the large scale analysis - of Biosynthetic Gene Clusters data + description: A highly scalable, user-interactive tool for the large scale + analysis of Biosynthetic Gene Clusters data homepage: "https://github.com/medema-group/bigslice" documentation: "https://github.com/medema-group/bigslice" tool_dev_url: "https://github.com/medema-group/bigslice" doi: "10.1093/gigascience/giaa154" - licence: ["AGPL v3-or-later"] + licence: + - "AGPL v3-or-later" identifier: "" - input: - - meta: type: map @@ -24,43 +24,46 @@ input: Groovy Map containing sample information e.g. `[ id:'sample1' ]` - bgc: - type: directory + type: file description: | - Path to a folder containing genomic regions in GenBank format, structured for BiG-SLiCE. - Each genome should have its own subfolder with region `.gbk` files. - The folder should also contain a datasets.tsv, and a taxonomy folder, with TSV taxonomy files per dataset. - See the tool's wiki for more information: https://github.com/medema-group/bigslice/wiki/Input-folder - pattern: "*" + List of GenBank (.gbk) files containing genomic region annotations for BiG-SLiCE input. + Each file represents a BGC region. The module internally organises them into the required + BiG-SLiCE input folder structure (datasets.tsv and taxonomy TSV). + pattern: "*.gbk" + ontologies: [] - hmmdb: type: directory description: | Path to the BiG-SLiCE HMM database folder containing biosynthetic and sub Pfams for annotation, in the required BiG-SLiCE format. An example directory in compressed archive format can be found here: https://github.com/medema-group/bigslice/releases/download/v2.0.0rc/bigslice-models.2022-11-30.tar.gz - + - export_tsv: + type: boolean + description: | + If true, runs a second BiG-SLiCE invocation to export all results from the SQLite database + to TSV files under `tsv_export/`. Additional arguments for this step can be passed via `task.ext.args2`. output: - db: + output: - - meta: type: map description: Groovy Map containing sample/dataset information - - ${prefix}/result/data.db: - type: file + - ${prefix}/result: + type: directory description: | - The results SQLite database. Contains various tables relevant to result - BGCs, CDSs, GCFs, HMMs and HSPs. - pattern: "data.db" - ontologies: - - edam: "http://edamontology.org/format_3621" # SQLite format - fa: + BiG-SLiCE result directory containing the SQLite database (`data.db`), + predicted feature FASTA files (`tmp/**/*.fa`), and optionally TSV exports + (`tsv_export/`) when `export_tsv` is `true`. + pattern: "result" + tsv: - - meta: type: map description: Groovy Map containing sample/dataset information - - ${prefix}/result/tmp/**/*.fa: - type: file + - ${prefix}/result/tsv_export: + type: directory description: | - Predicted features as FASTA files. One file per hit HMM. - pattern: "*.fa" - ontologies: - - edam: "http://edamontology.org/format_1929" # FASTA + Directory containing TSV exports of all parsed BGC metadata, vectorized + features and clustering results. Only present when `export_tsv` input is + set to `true`. + pattern: "tsv_export" versions_bigslice: - - ${task.process}: type: string @@ -70,8 +73,7 @@ output: description: The name of the tool - 2.0.2: type: string - description: The expression to obtain the version of the tool - + description: The version of the tool topics: versions: - - ${task.process}: @@ -82,9 +84,10 @@ topics: description: The name of the tool - 2.0.2: type: string - description: The expression to obtain the version of the tool - + description: The version of the tool authors: - "@vagkaratzas" + - "@SkyLex" maintainers: - "@vagkaratzas" + - "@SkyLex" diff --git a/modules/nf-core/bigslice/tests/main.nf.test b/modules/nf-core/bigslice/tests/main.nf.test index 19bcb011abe5..f895cfd0e01f 100644 --- a/modules/nf-core/bigslice/tests/main.nf.test +++ b/modules/nf-core/bigslice/tests/main.nf.test @@ -67,16 +67,76 @@ nextflow_process { [ meta, gbk_files ] } input[1] = UNTAR_HMMDB.out.untar.map{ it -> it[1] } + input[2] = false """ } } then { assert process.success + def resultDir = file(process.out.output[0][1]) + def allNames = [] + def tmpFaCount = 0 + resultDir.eachFileRecurse { f -> + if (!f.isDirectory()) { + def rel = resultDir.toPath().relativize(f.toPath()).toString() + if (rel.startsWith('tmp/') || rel.startsWith('tmp\\')) { + if (f.name.endsWith('.fa')) tmpFaCount++ + } else { + allNames.add(f.name) + } + } + } + assertAll( + { assert resultDir.isDirectory() }, + { assert tmpFaCount > 0 }, + { assert snapshot( + allNames.sort(), + process.out.findAll { key, val -> key.startsWith("versions")} + ).match() } + ) + } + + } + + test("streptomyces_coelicolor - bigslice - gbk - export_tsv") { + + when { + process { + """ + // Flatten the GBK directory into a list of individual GBK files with meta + input[0] = UNTAR_GBK.out.untar.map { meta, dir -> + def gbk_files = [] + dir.eachFileRecurse { if (it.name.endsWith('.gbk')) gbk_files << it } + [ meta, gbk_files ] + } + input[1] = UNTAR_HMMDB.out.untar.map{ it -> it[1] } + input[2] = true + """ + } + } + + then { + assert process.success + def resultDir = file(process.out.output[0][1]) + def allNames = [] + def tmpFaCount = 0 + resultDir.eachFileRecurse { f -> + if (!f.isDirectory()) { + def rel = resultDir.toPath().relativize(f.toPath()).toString() + if (rel.startsWith('tmp/') || rel.startsWith('tmp\\')) { + if (f.name.endsWith('.fa')) tmpFaCount++ + } else { + allNames.add(f.name) + } + } + } assertAll( + { assert resultDir.isDirectory() }, + { assert tmpFaCount > 0 }, + { assert file(process.out.tsv[0][1]).isDirectory() }, { assert snapshot( - file(process.out.db[0][1]).name, - process.out.fa[0][1].size(), + allNames.sort(), process.out.findAll { key, val -> key.startsWith("versions")} ).match() } ) @@ -98,6 +158,7 @@ nextflow_process { [ meta, gbk_files ] } input[1] = UNTAR_HMMDB.out.untar.map{ it -> it[1] } + input[2] = false """ } } diff --git a/modules/nf-core/bigslice/tests/main.nf.test.snap b/modules/nf-core/bigslice/tests/main.nf.test.snap index c678d7f89ce5..d945a37e3ff2 100644 --- a/modules/nf-core/bigslice/tests/main.nf.test.snap +++ b/modules/nf-core/bigslice/tests/main.nf.test.snap @@ -1,4 +1,30 @@ { + "streptomyces_coelicolor - bigslice - gbk - export_tsv": { + "content": [ + [ + "bgc_features_1.pkl", + "bgc_metadata.tsv", + "data.db", + "gcf_membership.tsv", + "gcf_models_1.pkl", + "run_metadata.tsv" + ], + { + "versions_bigslice": [ + [ + "BIGSLICE", + "bigslice", + "2.0.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-04-03T01:11:26.257005672" + }, "streptomyces_coelicolor - bigslice - gbk - stub": { "content": [ { @@ -7,16 +33,18 @@ { "id": "test_gbk" }, - "data.db:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + "data.db:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + [ + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ] ] ], "1": [ - [ - { - "id": "test_gbk" - }, - "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + ], "2": [ [ @@ -25,21 +53,23 @@ "2.0.2" ] ], - "db": [ + "output": [ [ { "id": "test_gbk" }, - "data.db:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + "data.db:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + [ + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ] ] ], - "fa": [ - [ - { - "id": "test_gbk" - }, - "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "tsv": [ + ], "versions_bigslice": [ [ @@ -63,12 +93,15 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-03-04T09:47:43.387153103" + "timestamp": "2026-04-02T22:45:23.737040708" }, "streptomyces_coelicolor - bigslice - gbk": { "content": [ - "data.db", - 40, + [ + "bgc_features_1.pkl", + "data.db", + "gcf_models_1.pkl" + ], { "versions_bigslice": [ [ @@ -83,6 +116,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-03-04T09:47:30.918713387" + "timestamp": "2026-04-03T01:10:19.794409662" } } \ No newline at end of file