From 416b5fbe17582382ef5d4a980329a72cb3c48a74 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 May 2026 15:43:16 -0400 Subject: [PATCH] Add --cluster-min-len option to vg deconstruct Gates -L/--cluster Jaccard clustering at each snarl by the longest non-boundary traversal length: clustering only runs at sites where the maximum interior traversal length is >= N bp (default 0 = always). Useful for SV-only clustering (set to 50) while keeping small variants represented exactly as they are. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/deconstructor.cpp | 46 +++++++++++++++++++++++++---- src/deconstructor.hpp | 6 ++++ src/subcommand/deconstruct_main.cpp | 18 +++++++++++ test/t/26_deconstruct.t | 16 ++++++++-- 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index fd18987884..311b08ccfe 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -859,11 +859,45 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; vector unused_child_snarl_mapping; - vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, - vector>(), - cluster_threshold, - trav_cluster_info, - unused_child_snarl_mapping); + vector> trav_clusters; + + // If a minimum-allele-length gate is set, skip clustering at sites + // where every used traversal's interior (non-boundary) sequence is + // shorter than the threshold. This lets the clustering be SV-only + // (default 50bp matches the standard SV size cutoff) while leaving + // small variants represented exactly as they are. + bool gate_clustering = false; + if (cluster_min_allele_len > 0 && cluster_threshold < 1.0) { + int64_t max_alt_len = 0; + for (size_t i = 0; i < travs.size(); ++i) { + if (!use_trav[i]) continue; + const Traversal& t = travs[i]; + int64_t len = 0; + if (t.size() > 2) { + for (size_t k = 1; k + 1 < t.size(); ++k) { + len += graph->get_length(t[k]); + } + } + if (len > max_alt_len) max_alt_len = len; + } + gate_clustering = max_alt_len < cluster_min_allele_len; + } + + if (gate_clustering) { + // Trivial clusters (one per used traversal) in sorted_travs order. + // Mirrors what cluster_traversals would return when nothing collapses. + trav_cluster_info.assign(travs.size(), make_pair(-1.0, (int64_t)0)); + for (int idx : sorted_travs) { + trav_clusters.push_back({idx}); + trav_cluster_info[idx] = make_pair(1.0, (int64_t)0); + } + } else { + trav_clusters = cluster_traversals(graph, travs, sorted_travs, + vector>(), + cluster_threshold, + trav_cluster_info, + unused_child_snarl_mapping); + } #ifdef debug cerr << "cluster priority"; @@ -1286,6 +1320,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool strict_conflicts, bool long_ref_contig, double cluster_threshold, + int64_t cluster_min_allele_len, gbwt::GBWT* gbwt, bool star_allele) { @@ -1301,6 +1336,7 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); } this->cluster_threshold = cluster_threshold; + this->cluster_min_allele_len = cluster_min_allele_len; this->gbwt = gbwt; this->star_allele = star_allele; diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 604d7216ba..2b26015d93 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -49,6 +49,7 @@ class Deconstructor : public VCFOutputCaller { bool strict_conflicts, bool long_ref_contig, double cluster_threshold = 1.0, + int64_t cluster_min_allele_len = 0, gbwt::GBWT* gbwt = nullptr, bool star_allele = false); @@ -188,6 +189,11 @@ class Deconstructor : public VCFOutputCaller { // merge if identical (which is what deconstruct has always done) double cluster_threshold = 1.0; + // only apply cluster_threshold at sites where the longest non-boundary + // traversal sequence is at least this many bp. 0 disables the gate + // (so cluster_threshold applies at every site). + int64_t cluster_min_allele_len = 0; + // use *-alleles to represent haplotypes that span the parent site but don't // traverse the current nested site (e.g., a deletion that spans a nested SNP) // only works with include_nested diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 1e36db320a..f9cbac9a8a 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -64,6 +64,9 @@ void help_deconstruct(char** argv) { << " for reference if possible (i.e. only one ref sample)" << endl << " -L, --cluster F cluster traversals whose (handle) Jaccard coefficient" << endl << " is >= F together [1.0; experimental]" << endl + << " --cluster-min-len N only apply -L clustering at sites where the longest" << endl + << " non-boundary traversal is >= N bp (50 = SVs only," << endl + << " 0 = always) [0]" << endl << " -R, --star-allele use *-alleles to represent haplotypes that span the" << endl << " parent but don't traverse nested sites (requires -a)" << endl << " -t, --threads N use N threads" << endl @@ -94,8 +97,11 @@ int main_deconstruct(int argc, char** argv) { bool untangle_traversals = false; bool contig_only_ref = false; double cluster_threshold = 1.0; + int64_t cluster_min_allele_len = 0; bool star_allele = false; + constexpr int OPT_CLUSTER_MIN_LEN = 1000; + int c; optind = 2; // force optind past command positional argument while (true) { @@ -118,6 +124,7 @@ int main_deconstruct(int argc, char** argv) { {"strict-conflicts", no_argument, 0, 'S'}, {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, + {"cluster-min-len", required_argument, 0, OPT_CLUSTER_MIN_LEN}, {"star-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, @@ -182,6 +189,12 @@ int main_deconstruct(int argc, char** argv) { case 'L': cluster_threshold = max(0.0, min(1.0, parse(optarg))); break; + case OPT_CLUSTER_MIN_LEN: + cluster_min_allele_len = parse(optarg); + if (cluster_min_allele_len < 0) { + logger.error() << "--cluster-min-len must be >= 0" << endl; + } + break; case 'R': star_allele = true; break; @@ -206,6 +219,10 @@ int main_deconstruct(int argc, char** argv) { logger.error() << "-R can only be used with -a" << endl; } + if (cluster_min_allele_len > 0 && cluster_threshold >= 1.0) { + logger.warn() << "--cluster-min-len has no effect without -L (cluster threshold < 1.0)" << endl; + } + // Read the graph unique_ptr path_handle_graph_up; unique_ptr gbz_graph; @@ -422,6 +439,7 @@ int main_deconstruct(int argc, char** argv) { strict_conflicts, !contig_only_ref, cluster_threshold, + cluster_min_allele_len, gbwt_index, star_allele); diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 15a6f53f91..2c4312b078 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 59 +plan tests 63 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -156,7 +156,19 @@ is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles" is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info" -rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf +# --cluster-min-len gates when the longest non-boundary traversal is below +# the threshold. small_cluster's only snarl has max interior length 6 bp. +vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 0 > small_cluster_3_off.vcf +is "$(tail -1 small_cluster_3_off.vcf | awk '{print $5}')" "G" "--cluster-min-len 0 is equivalent to clustering everywhere" +vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 50 > small_cluster_3_50.vcf +is "$(tail -1 small_cluster_3_50.vcf | awk '{print $5}')" "GATTTGA,G" "--cluster-min-len 50 gates clustering on a small site" +vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 6 > small_cluster_3_6.vcf +is "$(tail -1 small_cluster_3_6.vcf | awk '{print $5}')" "G" "--cluster-min-len at the site length still clusters" +vg deconstruct small_cluster.gfa -p x --cluster-min-len 50 2>/dev/null > small_cluster_min_only.vcf +diff small_cluster_0.vcf small_cluster_min_only.vcf +is "$?" 0 "--cluster-min-len without -L is a no-op" + +rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf small_cluster_3_off.vcf small_cluster_3_50.vcf small_cluster_3_6.vcf small_cluster_min_only.vcf # Nesting tests now use a two-step process: # 1. Compute gref cover with vg paths