From 647de61ec120d5339fc7bbffbadc60bd758d76e1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 12 May 2026 18:11:14 -0400 Subject: [PATCH 01/23] Break out chain scoring related values into a parameter struct --- src/algorithms/chain_items.cpp | 65 ++++++++++------------------ src/algorithms/chain_items.hpp | 47 +++++++++++--------- src/minimizer_mapper_from_chains.cpp | 19 +++++--- src/subcommand/chain_main.cpp | 9 ++-- 4 files changed, 67 insertions(+), 73 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index ebc46da684..2c96bec8c9 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -638,13 +638,9 @@ TracedScore chain_items_dp(vector& chain_scores, const HandleGraph& graph, int gap_open, int gap_extension, + const ChainScoringScheme& scheme, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases, - int recomb_penalty, bool show_work) { DiagramExplainer diagram(show_work); @@ -657,7 +653,7 @@ TracedScore chain_items_dp(vector& chain_scores, cerr << "Chaining group of " << to_chain.size() << " items" << endl; } - crash_unless(recomb_penalty >= 0); + crash_unless(scheme.recombination_penalty >= 0); // Compute a base seed average length. // TODO: Weight anchors differently? @@ -681,11 +677,11 @@ TracedScore chain_items_dp(vector& chain_scores, // We store the bonus used to select the current winning predecessor for // each seed in this vector, which runs alongside the DP table. // - // Starting from nowhere means full path conservation, so bonus = recomb_penalty. - std::vector eval_bonuses(to_chain.size(), recomb_penalty); + // Starting from nowhere means full path conservation, so bonus = scheme.recombination_penalty. + std::vector eval_bonuses(to_chain.size(), scheme.recombination_penalty); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. - chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; + chain_scores[i] = {(int)(to_chain[i].score() * scheme.item_scale + scheme.item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; } // We will run this over every transition in a good DP order. @@ -704,7 +700,7 @@ TracedScore chain_items_dp(vector& chain_scores, auto& here = to_chain[transition.to_anchor]; // How many points is it worth to collect? - auto item_points = here.score() * item_scale + item_bonus; + auto item_points = here.score() * scheme.item_scale + scheme.item_bonus; std::string here_gvnode; if (diagram) { @@ -712,10 +708,10 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - // This also has full path conservation (bonus = recomb_penalty). + // This also has full path conservation (bonus = scheme.recombination_penalty). { TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; - int nowhere_bonus = recomb_penalty; + int nowhere_bonus = scheme.recombination_penalty; int eval_nowhere = from_nowhere.score + nowhere_bonus; int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; if (eval_nowhere > eval_current) { @@ -785,13 +781,13 @@ TracedScore chain_items_dp(vector& chain_scores, // // But we account for anchor length in the item points, so don't use it // here. - jump_points = -score_chain_gap(indel_length, base_seed_length) * gap_scale; + jump_points = -score_chain_gap(indel_length, base_seed_length) * scheme.gap_scale; // add recombination penalty if necessary - jump_points -= check_recombination(chain_scores[transition.from_anchor], here) * recomb_penalty; + jump_points -= check_recombination(chain_scores[transition.from_anchor], here) * scheme.recombination_penalty; // We can also account for the non-indel material, which we assume will have some identity in it. - jump_points += possible_match_length * points_per_possible_match; + jump_points += possible_match_length * scheme.points_per_possible_match; } if (jump_points != numeric_limits::min()) { @@ -803,15 +799,15 @@ TracedScore chain_items_dp(vector& chain_scores, .set_shared_paths(here.anchor_paths()); // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. - // Bonus = fraction of conserved paths * recomb_penalty. + // Bonus = fraction of conserved paths * scheme.recombination_penalty. // Bonus is 0 when recombination occurs (no shared paths). int eval_bonus_from = 0; - if (recomb_penalty > 0) { + if (scheme.recombination_penalty > 0) { int pre_count = __builtin_popcountll(source_score.paths); if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { // No recombination: bonus = fraction of paths conserved * penalty int post_count = __builtin_popcountll(from_source_score.paths); - eval_bonus_from = (recomb_penalty * post_count) / pre_count; + eval_bonus_from = (scheme.recombination_penalty * post_count) / pre_count; } // Recombination case (no shared paths): bonus stays 0 } @@ -893,7 +889,7 @@ TracedScore chain_items_dp(vector& chain_scores, if (diagram) { // Draw the item in the diagram - auto item_points = here.score() * item_scale + item_bonus; + auto item_points = here.score() * scheme.item_scale + scheme.item_bonus; std::string here_gvnode = "i" + std::to_string(to_anchor); std::stringstream label_stream; label_stream << "#" << to_anchor << " " << here << " = " << item_points @@ -939,8 +935,7 @@ TracedScore chain_items_dp(vector& chain_scores, vector, int>> chain_items_traceback(const vector& chain_scores, const VectorView& to_chain, const TracedScore& best_past_ending_score_ever, - int item_bonus, - double item_scale, + const ChainScoringScheme& scheme, size_t max_tracebacks) { // We will fill this in with all the tracebacks, and then sort and truncate. @@ -988,7 +983,7 @@ vector, int>> chain_items_traceback(const vector& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty, + const ChainScoringScheme& scheme, size_t max_chains, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases, bool show_work) { @@ -1054,20 +1045,16 @@ ChainsResult find_best_chains(const VectorView& to_chain, graph, gap_open, gap_extension, + scheme, for_each_transition, - item_bonus, - item_scale, - gap_scale, - points_per_possible_match, max_indel_bases, - recomb_penalty, show_work); #ifdef debug_chaining std::cerr << "[REC INFO] Recombination number for chain: " << best_past_ending_score_ever.rec_num << "\tscore: " << best_past_ending_score_ever.score << "\tpaths: " << best_past_ending_score_ever.paths << std::endl; #endif // Then do the tracebacks vector, int>> tracebacks = chain_items_traceback( - chain_scores, to_chain, best_past_ending_score_ever, item_bonus, item_scale, max_chains); + chain_scores, to_chain, best_past_ending_score_ever, scheme, max_chains); if (tracebacks.empty()) { // Somehow we got nothing @@ -1136,12 +1123,8 @@ pair> find_best_chain(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty, + const ChainScoringScheme& scheme, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases) { ChainsResult cr = find_best_chains( @@ -1150,13 +1133,9 @@ pair> find_best_chain(const VectorView& to_chain, graph, gap_open, gap_extension, - recomb_penalty, + scheme, 1, for_each_transition, - item_bonus, - item_scale, - gap_scale, - points_per_possible_match, max_indel_bases ); return cr.chains.front().scored_chain; diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index f4531d4201..0299c7a83a 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -382,6 +382,7 @@ ostream& operator<<(ostream& out, const TracedScore& value); */ void sort_anchor_indexes(const std::vector& items, std::vector& indexes); +/// Represents a possible transition between anchors struct transition_info { // Index of the source anchor within the list of anchors size_t from_anchor; @@ -392,14 +393,31 @@ struct transition_info { // Distance between anchors in the read size_t read_distance; - // Constructor; read_distance defaults to max if not given + /// Build transition info from loose values inline transition_info(size_t from, size_t to, size_t graph_dist, size_t read_dist = std::numeric_limits::max()) : from_anchor(from), to_anchor(to), graph_distance(graph_dist), read_distance(read_dist) {} }; + +/// Represents the scoring scheme for chains, to determine which are best. +/// Doesn't cover the parameters that really belong to an alignment scoring +/// scheme (like gap open and extend). +struct ChainScoringScheme { + /// Score bonus for each item collected + int item_bonus = 0; + /// Scale to apply to each item's own score + double item_scale = 1.0; + /// Scale to apply to the scores of gaps + double gap_scale = 1.0; + double points_per_possible_match = 0; + int recombination_penalty = 0; +}; + /// A single chain result: scored chain plus the recombination count observed /// on its endpoint. +/// TODO: Is there a better name for the abstraction this is getting at? struct ChainWithRec { + // TODO: Shouldn't we split this into 2 fields? std::pair> scored_chain; // Positions (anchor indices) in the chain that introduce a recombination // event between anchors. These correspond to anchors where we had to @@ -410,6 +428,8 @@ struct ChainWithRec { /// Result of finding best chains: a list of chains each paired with the /// recombination count observed at that chain's endpoint. +/// TODO: Can we get rid of this once we're sure it won't need more fields? +/// TODO: Is there a better name for this? struct ChainsResult { std::vector chains; }; @@ -517,8 +537,7 @@ std::vector calculate_transition_read_distances( * * Input items must be sorted by start position in the read. * - * Takes the given per-item bonus for each item collected, and scales item - * scores by the given scale. + * Uses the given scoring scheme to score chains. * * Uses a transition iterator to enumerate where we can come from to reach an * item. @@ -530,15 +549,12 @@ TracedScore chain_items_dp(vector& chain_scores, const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, + // TODO: We should maybe just take an EditAlignmentScorer here. int gap_open, int gap_extension, + const ChainScoringScheme& scheme = ChainScoringScheme(), const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100, - int recomb_penalty = 0, bool show_work = false ); @@ -559,8 +575,7 @@ TracedScore chain_items_dp(vector& chain_scores, vector, int>> chain_items_traceback(const vector& chain_scores, const VectorView& to_chain, const TracedScore& best_past_ending_score_ever, - int item_bonus = 0, - double item_scale = 1.0, + const ChainScoringScheme& scheme = ChainScoringScheme(), size_t max_tracebacks = 1); @@ -578,13 +593,9 @@ ChainsResult find_best_chains(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty = 0, + const ChainScoringScheme& scheme = ChainScoringScheme(), size_t max_chains = 1, const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100, bool show_work = false); @@ -602,12 +613,8 @@ pair> find_best_chain(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty = 0, + const ChainScoringScheme& scheme = ChainScoringScheme(), const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100); /** diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 3c8cb7b50e..a35c12c9d4 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -878,11 +878,11 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { size_t chain_index = alignments_to_source[alignment_index]; if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); - if (rec_penalty_chain != 0) { + if (rec_penalty != 0) { // Penalize the score of alignment candidates according to the number of recombinations their chains required. // This allows alignments that required fewer recombinations in their chains to win. // TODO: We'd also eventaully like to count recombinations that we don't know are needed until base-level DP. - int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); + int64_t penalty = static_cast(rec_penalty) * static_cast(chain_rec_counts[chain_index]); int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; alignments[alignment_index].set_score(static_cast(penalized_score)); } @@ -1575,19 +1575,24 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& graph_lookback_limit, read_lookback_limit ); + // TODO: Should we just inherit from ChainScoringScheme? Or should + // we set one up as a member? + algorithms::ChainScoringScheme scheme { + this->item_bonus, + this->item_scale, + this->gap_scale, + this->points_per_possible_match, + this->rec_penalty_chain + }; chain_results = algorithms::find_best_chains( anchor_view, *distance_index, gbwt_graph, get_regular_aligner()->scorer->gap_open, get_regular_aligner()->scorer->gap_extension, - this->rec_penalty_chain, + scheme, this->max_alignments, for_each_transition, - this->item_bonus, - this->item_scale, - this->gap_scale, - this->points_per_possible_match, indel_limit, show_work ); diff --git a/src/subcommand/chain_main.cpp b/src/subcommand/chain_main.cpp index b595077e80..c8108e0aab 100644 --- a/src/subcommand/chain_main.cpp +++ b/src/subcommand/chain_main.cpp @@ -38,7 +38,7 @@ int main_chain(int argc, char** argv) { Logger logger("vg chain"); bool show_progress = false; - int recomb_penalty = 0; + int recombination_penalty = 0; int c; optind = 2; // force optind past command positional argument @@ -67,7 +67,7 @@ int main_chain(int argc, char** argv) { break; case 'r': - recomb_penalty = atof(optarg); + recombination_penalty = atof(optarg); break; case 'h': @@ -284,7 +284,10 @@ int main_chain(int argc, char** argv) { #endif // Do the chaining. We assume items is already sorted right. - std::pair> score_and_chain = vg::algorithms::find_best_chain(items, distance_index, graph, scorer.scorer->gap_open, scorer.scorer->gap_extension, recomb_penalty); + // TODO: Replace with designated initializer list when we get C++20 + vg::algorithms::ChainScoringScheme scheme; + scheme.recombination_penalty = recombination_penalty; + std::pair> score_and_chain = vg::algorithms::find_best_chain(items, distance_index, graph, scorer.scorer->gap_open, scorer.scorer->gap_extension, scheme); logger.info() << "Best chain gets score " << score_and_chain.first << std::endl; From 748b814b122bf11db0a4bee75ae4cd6ae775114d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 12 May 2026 18:28:46 -0400 Subject: [PATCH 02/23] Split --rec-penalty-chain into --rec-penalty, --rec-penalty-aln, and --rec-consistency-bonus --- src/algorithms/chain_items.cpp | 16 +++++++++------- src/algorithms/chain_items.hpp | 6 ++++++ src/minimizer_mapper.hpp | 12 +++++++++--- src/minimizer_mapper_from_chains.cpp | 6 ++++-- src/subcommand/giraffe_main.cpp | 22 ++++++++++++++++++---- 5 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index 2c96bec8c9..1db5b0e88c 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -654,6 +654,8 @@ TracedScore chain_items_dp(vector& chain_scores, } crash_unless(scheme.recombination_penalty >= 0); + crash_unless(scheme.consistency_bonus >= 0); + // Compute a base seed average length. // TODO: Weight anchors differently? @@ -677,8 +679,8 @@ TracedScore chain_items_dp(vector& chain_scores, // We store the bonus used to select the current winning predecessor for // each seed in this vector, which runs alongside the DP table. // - // Starting from nowhere means full path conservation, so bonus = scheme.recombination_penalty. - std::vector eval_bonuses(to_chain.size(), scheme.recombination_penalty); + // Starting from nowhere means full path conservation, so bonus = scheme.consistency_bonus. + std::vector eval_bonuses(to_chain.size(), scheme.consistency_bonus); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. chain_scores[i] = {(int)(to_chain[i].score() * scheme.item_scale + scheme.item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; @@ -708,10 +710,10 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - // This also has full path conservation (bonus = scheme.recombination_penalty). + // This also has full path conservation (bonus = scheme.consistency_bonus). { TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; - int nowhere_bonus = scheme.recombination_penalty; + int nowhere_bonus = scheme.consistency_bonus; int eval_nowhere = from_nowhere.score + nowhere_bonus; int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; if (eval_nowhere > eval_current) { @@ -799,15 +801,15 @@ TracedScore chain_items_dp(vector& chain_scores, .set_shared_paths(here.anchor_paths()); // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. - // Bonus = fraction of conserved paths * scheme.recombination_penalty. + // Bonus = fraction of conserved paths * scheme.consistency_bonus. // Bonus is 0 when recombination occurs (no shared paths). int eval_bonus_from = 0; - if (scheme.recombination_penalty > 0) { + if (scheme.consistency_bonus > 0) { int pre_count = __builtin_popcountll(source_score.paths); if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { // No recombination: bonus = fraction of paths conserved * penalty int post_count = __builtin_popcountll(from_source_score.paths); - eval_bonus_from = (scheme.recombination_penalty * post_count) / pre_count; + eval_bonus_from = (scheme.consistency_bonus * post_count) / pre_count; } // Recombination case (no shared paths): bonus stays 0 } diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index 0299c7a83a..cf112d0d89 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -409,8 +409,14 @@ struct ChainScoringScheme { double item_scale = 1.0; /// Scale to apply to the scores of gaps double gap_scale = 1.0; + /// Add this many points per potential match between two seeds double points_per_possible_match = 0; + /// Penalize this many points per recombination int recombination_penalty = 0; + /// Apply a bonus during alternative selection (but not to actual DP + /// scores) of this many points when matching haplotype paths are + /// preserved, scaled by fraction of haplotypes preserved. + int consistency_bonus = 0; }; /// A single chain result: scored chain plus the recombination count observed diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 48924f0a2a..cddd77d105 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -291,9 +291,15 @@ class MinimizerMapper : public AlignerClient { /// at chaining? static constexpr double default_gap_scale = 1.0; double gap_scale = default_gap_scale; - /// Recombination penalty for chaining. This is added to the score of a transition if there are no shared haplotypes. - static constexpr int default_rec_penalty_chain = 0; - int rec_penalty_chain = default_rec_penalty_chain; + /// Recombination penalty. This is added to the cost of a transition if there are no shared haplotypes. + static constexpr int default_rec_penalty = 0; + int rec_penalty = default_rec_penalty; + /// Recombination penalty for alignment scoring. This is deducted from the score of an alignment for each required recombination. -1 means use rec_penalty. + static constexpr int default_rec_penalty_aln = -1; + int rec_penalty_aln = default_rec_penalty_aln; + /// Recombination-aware chaining bonus for avoiding losing haplotypes. Not actually tracked in chaining DP score. -1 means use rec_penalty. + static constexpr int default_rec_consistency_bonus = -1; + int rec_consistency_bonus = default_rec_consistency_bonus; // How many points should we treat a non-gap connection base as producing, at chaining? static constexpr double default_points_per_possible_match = 0; double points_per_possible_match = default_points_per_possible_match; diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index a35c12c9d4..5a9ec5e3a0 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -882,7 +882,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // Penalize the score of alignment candidates according to the number of recombinations their chains required. // This allows alignments that required fewer recombinations in their chains to win. // TODO: We'd also eventaully like to count recombinations that we don't know are needed until base-level DP. - int64_t penalty = static_cast(rec_penalty) * static_cast(chain_rec_counts[chain_index]); + int64_t penalty = static_cast(rec_penalty_aln == -1 ? rec_penalty : rec_penalty_aln) * static_cast(chain_rec_counts[chain_index]); int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; alignments[alignment_index].set_score(static_cast(penalized_score)); } @@ -1582,7 +1582,9 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& this->item_scale, this->gap_scale, this->points_per_possible_match, - this->rec_penalty_chain + this->rec_penalty, + // TODO: Do this once at setup? + this->rec_consistency_bonus == -1 ? this->rec_penalty : this->rec_consistency_bonus, }; chain_results = algorithms::find_best_chains( anchor_view, diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 4c9671b284..b75b62edfd 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -450,10 +450,24 @@ static std::unique_ptr get_options() { double_is_nonnegative ); chaining_opts.add_range( - "rec-penalty-chain", - &MinimizerMapper::rec_penalty_chain, - MinimizerMapper::default_rec_penalty_chain, - "penalty for a recombination when chaining, requires -E (ALPHA)", + "rec-penalty", + &MinimizerMapper::rec_penalty, + MinimizerMapper::default_rec_penalty, + "penalty for a recombination, requires -E (ALPHA)", + int_is_nonnegative + ); + chaining_opts.add_range( + "rec-penalty-aln", + &MinimizerMapper::rec_penalty_aln, + MinimizerMapper::default_rec_penalty_aln, + "penalty for a recombination for final alignment scoring, -1 for --rec-penalty, requires -E (ALPHA)", + int_is_nonnegative + ); + chaining_opts.add_range( + "rec-consistency-bonus", + &MinimizerMapper::rec_consistency_bonus, + MinimizerMapper::default_rec_consistency_bonus, + "untracked bonus for chains staying consistent with haplotypes to avoid recombinations, -1 for --rec-penalty, requires -E (ALPHA)", int_is_nonnegative ); chaining_opts.add_range( From d8a81464f15169ca3b9187c414e675940b862719 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 11:58:01 -0400 Subject: [PATCH 03/23] Move rec mode option to option parser system --- src/subcommand/giraffe_main.cpp | 33 ++++++++++++++++++--------------- src/subcommand/options.hpp | 2 +- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index aedc8be2dc..87449ac17d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -80,6 +80,9 @@ struct GiraffeMainOptions { /// Should we prune low-complexity anchors when surjecting? static constexpr bool default_prune_low_cplx = false; bool prune_low_cplx = default_prune_low_cplx; + /// Should we use recombination-aware mapping? + static constexpr bool default_rec_mode = false; + bool rec_mode = default_rec_mode; }; /// Options struct for scoring-related parameters. Defaults are in aligner.hpp. @@ -120,6 +123,12 @@ static std::unique_ptr get_options() { GiraffeMainOptions::default_prune_low_cplx, "prune short and low complexity anchors during linear format realignment (on by default for long reads)" ); + main_opts.add_flag( + "rec-mode", 'E', + &GiraffeMainOptions::rec_mode, + GiraffeMainOptions::default_rec_mode, + "activate giraffe recombination-aware mode" + ); // Configure scoring auto& scoring_opts = parser->add_group("scoring options"); @@ -729,7 +738,6 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map rescue_algorithms = { { "none", MinimizerMapper::rescue_none }, @@ -1138,7 +1142,6 @@ int main_giraffe(int argc, char** argv) { {"output-basename", required_argument, 0, OPT_OUTPUT_BASENAME}, {"report-name", required_argument, 0, OPT_REPORT_NAME}, {"parameter-preset", required_argument, 0, 'b'}, - {"rec-mode", no_argument, 0, 'E'}, {"rescue-algorithm", required_argument, 0, 'A'}, {"fragment-mean", required_argument, 0, OPT_FRAGMENT_MEAN }, {"fragment-stdev", required_argument, 0, OPT_FRAGMENT_STDEV }, @@ -1153,7 +1156,7 @@ int main_giraffe(int argc, char** argv) { parser->make_long_options(long_options); long_options.push_back({0, 0, 0, 0}); - std::string short_options = "h?Z:x:g:H:m:z:d:pG:f:iN:R:o:nb:t:A:E"; + std::string short_options = "h?Z:x:g:H:m:z:d:pG:f:iN:R:o:nb:t:A:"; parser->make_short_options(short_options); if (argc == 2) { @@ -1355,9 +1358,6 @@ int main_giraffe(int argc, char** argv) { map_long_reads = true; } break; - case 'E': - use_path_minimizer = true; - break; case 'A': { std::string algo_name = optarg; @@ -1423,6 +1423,9 @@ int main_giraffe(int argc, char** argv) { } } + // Fill in the optiopn values for the first values of all the ranges. + parser->reset_chain(); + // Now we can use main_options. // Get positional arguments before validating user intent if (have_input_file(optind, argc, argv)) { @@ -1534,7 +1537,7 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } - if (use_path_minimizer && !map_long_reads) { + if (main_options.rec_mode && !map_long_reads) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } @@ -1653,7 +1656,7 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (use_path_minimizer) { + if (main_options.rec_mode) { indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); } else { @@ -1743,7 +1746,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (use_path_minimizer) { + if (main_options.rec_mode) { index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); } else { index_targets = VGIndexes::get_default_long_giraffe_indexes(); @@ -1782,7 +1785,7 @@ int main_giraffe(int argc, char** argv) { unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (use_path_minimizer) { + if (main_options.rec_mode) { minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -1805,7 +1808,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (use_path_minimizer) { + if (main_options.rec_mode) { oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { oversized_zipcodes_indexname = "Long Read Zipcodes"; diff --git a/src/subcommand/options.hpp b/src/subcommand/options.hpp index 0f1d38a3f2..6403a90878 100644 --- a/src/subcommand/options.hpp +++ b/src/subcommand/options.hpp @@ -150,7 +150,7 @@ struct TickChainLink { /// May not delegate to a different item. virtual bool tick_along_chain(); - /// Reset along the chain, makign this item and all parents take on their + /// Reset along the chain, making this item and all parents take on their /// initial values. virtual void reset_along_chain(); }; From 2401b0f6562f10ff12c8150447f0aa3c351146a4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 13:44:24 -0400 Subject: [PATCH 04/23] Start on logic to decide the real default recombination mode value --- src/index_registry.hpp | 4 ++++ src/subcommand/giraffe_main.cpp | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 7f20a5fd37..0b8067d308 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -145,6 +145,10 @@ struct IndexingParameters { static int haplotype_sampling_minimizer_w; }; +/// How many hapolotypoes should we expect to be able to store on a minimizer index path payload? +/// TODO: Move this to the minimizer index. +static constexpr int MAX_PAYLOAD_PATHS = 60; + /** * A struct namespace for standard inputs */ diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 87449ac17d..020c5b31c0 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1551,6 +1551,25 @@ int main_giraffe(int argc, char** argv) { // or when either --haplotype-name or --kff-name is given. // Otherwise we'd always sample when we had reads available. bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); + + if (haplotype_sampling) { + // Figure out how many samples we need in the haplotype-sampled graph. + // TODO: This assumes all reference samples are haploid. + int hap_count = reference_samples.size() + IndexingParameters::haplotype_sampling_diploid ? 2 : IndexingParameters::haplotype_sampling_num_haplotypes; + if (hap_count > MAX_PAYLOAD_PATHS) { + // We won't be able to store our sampled haplotypes in the index payload. + // We want to default recombination-awareness off. + + + if (main_options.rec_mode) { + // The user asked for recombination-aware chaining anyway. + logger.error() << "Cannot store " << hap_count << " distinct haplotypes in minimizer index payloads for --rec-mode" << std::endl; + } + } else { + // Our samples haplotypes fit in the recombination-aware chaining payload. + // We want to default recombination-awarenss on. + } + } string sample_scope; if (haplotype_sampling) { From 78ebc23d6f8946773417a8ed49a83c09d2e76f0c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 13:44:33 -0400 Subject: [PATCH 05/23] Revert "Move rec mode option to option parser system" This reverts commit d8a81464f15169ca3b9187c414e675940b862719. --- src/subcommand/giraffe_main.cpp | 33 +++++++++++++++------------------ src/subcommand/options.hpp | 2 +- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 020c5b31c0..b18ff8fe8b 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -80,9 +80,6 @@ struct GiraffeMainOptions { /// Should we prune low-complexity anchors when surjecting? static constexpr bool default_prune_low_cplx = false; bool prune_low_cplx = default_prune_low_cplx; - /// Should we use recombination-aware mapping? - static constexpr bool default_rec_mode = false; - bool rec_mode = default_rec_mode; }; /// Options struct for scoring-related parameters. Defaults are in aligner.hpp. @@ -123,12 +120,6 @@ static std::unique_ptr get_options() { GiraffeMainOptions::default_prune_low_cplx, "prune short and low complexity anchors during linear format realignment (on by default for long reads)" ); - main_opts.add_flag( - "rec-mode", 'E', - &GiraffeMainOptions::rec_mode, - GiraffeMainOptions::default_rec_mode, - "activate giraffe recombination-aware mode" - ); // Configure scoring auto& scoring_opts = parser->add_group("scoring options"); @@ -738,6 +729,7 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map rescue_algorithms = { { "none", MinimizerMapper::rescue_none }, @@ -1142,6 +1138,7 @@ int main_giraffe(int argc, char** argv) { {"output-basename", required_argument, 0, OPT_OUTPUT_BASENAME}, {"report-name", required_argument, 0, OPT_REPORT_NAME}, {"parameter-preset", required_argument, 0, 'b'}, + {"rec-mode", no_argument, 0, 'E'}, {"rescue-algorithm", required_argument, 0, 'A'}, {"fragment-mean", required_argument, 0, OPT_FRAGMENT_MEAN }, {"fragment-stdev", required_argument, 0, OPT_FRAGMENT_STDEV }, @@ -1156,7 +1153,7 @@ int main_giraffe(int argc, char** argv) { parser->make_long_options(long_options); long_options.push_back({0, 0, 0, 0}); - std::string short_options = "h?Z:x:g:H:m:z:d:pG:f:iN:R:o:nb:t:A:"; + std::string short_options = "h?Z:x:g:H:m:z:d:pG:f:iN:R:o:nb:t:A:E"; parser->make_short_options(short_options); if (argc == 2) { @@ -1358,6 +1355,9 @@ int main_giraffe(int argc, char** argv) { map_long_reads = true; } break; + case 'E': + use_path_minimizer = true; + break; case 'A': { std::string algo_name = optarg; @@ -1423,9 +1423,6 @@ int main_giraffe(int argc, char** argv) { } } - // Fill in the optiopn values for the first values of all the ranges. - parser->reset_chain(); - // Now we can use main_options. // Get positional arguments before validating user intent if (have_input_file(optind, argc, argv)) { @@ -1537,7 +1534,7 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } - if (main_options.rec_mode && !map_long_reads) { + if (use_path_minimizer && !map_long_reads) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } @@ -1675,7 +1672,7 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (main_options.rec_mode) { + if (use_path_minimizer) { indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); } else { @@ -1765,7 +1762,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (main_options.rec_mode) { + if (use_path_minimizer) { index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); } else { index_targets = VGIndexes::get_default_long_giraffe_indexes(); @@ -1804,7 +1801,7 @@ int main_giraffe(int argc, char** argv) { unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (main_options.rec_mode) { + if (use_path_minimizer) { minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -1827,7 +1824,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (main_options.rec_mode) { + if (use_path_minimizer) { oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { oversized_zipcodes_indexname = "Long Read Zipcodes"; diff --git a/src/subcommand/options.hpp b/src/subcommand/options.hpp index 6403a90878..0f1d38a3f2 100644 --- a/src/subcommand/options.hpp +++ b/src/subcommand/options.hpp @@ -150,7 +150,7 @@ struct TickChainLink { /// May not delegate to a different item. virtual bool tick_along_chain(); - /// Reset along the chain, making this item and all parents take on their + /// Reset along the chain, makign this item and all parents take on their /// initial values. virtual void reset_along_chain(); }; From 23b32b12d58ec4ca2223d8b26d2f0ca3c94d9df9 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 13:45:21 -0400 Subject: [PATCH 06/23] Fix spelling --- src/subcommand/options.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/subcommand/options.hpp b/src/subcommand/options.hpp index 0f1d38a3f2..54e8fd62bd 100644 --- a/src/subcommand/options.hpp +++ b/src/subcommand/options.hpp @@ -150,7 +150,7 @@ struct TickChainLink { /// May not delegate to a different item. virtual bool tick_along_chain(); - /// Reset along the chain, makign this item and all parents take on their + /// Reset along the chain, making this item and all parents take on their /// initial values. virtual void reset_along_chain(); }; @@ -883,7 +883,7 @@ struct BaseOptionGroup : public TickChainLink { /// otherwies. virtual bool set(const BaseValuation& entry) = 0; - /// Fill in entry with the value of the correspondign option, if we have + /// Fill in entry with the value of the corresponding option, if we have /// that option. If so, return true. virtual bool query(BaseValuation& entry) const = 0; From 91c281761876f2217871ea50dcd91373867384d2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 13:46:28 -0400 Subject: [PATCH 07/23] Rename variable --- src/subcommand/giraffe_main.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index b18ff8fe8b..d42e4c1039 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -893,7 +893,7 @@ int main_giraffe(int argc, char** argv) { bool map_long_reads = false; // If recombination mode is set, we are using PathMinimizer, else standard Giraffe. - bool use_path_minimizer = false; + bool rec_mode = false; // Map algorithm names to rescue algorithms std::map rescue_algorithms = { @@ -1356,7 +1356,7 @@ int main_giraffe(int argc, char** argv) { } break; case 'E': - use_path_minimizer = true; + rec_mode = true; break; case 'A': { @@ -1534,7 +1534,7 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } - if (use_path_minimizer && !map_long_reads) { + if (rec_mode && !map_long_reads) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } @@ -1558,7 +1558,7 @@ int main_giraffe(int argc, char** argv) { // We want to default recombination-awareness off. - if (main_options.rec_mode) { + if (rec_mode) { // The user asked for recombination-aware chaining anyway. logger.error() << "Cannot store " << hap_count << " distinct haplotypes in minimizer index payloads for --rec-mode" << std::endl; } @@ -1672,7 +1672,7 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (use_path_minimizer) { + if (rec_mode) { indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); } else { @@ -1762,7 +1762,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (use_path_minimizer) { + if (rec_mode) { index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); } else { index_targets = VGIndexes::get_default_long_giraffe_indexes(); @@ -1801,7 +1801,7 @@ int main_giraffe(int argc, char** argv) { unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (use_path_minimizer) { + if (rec_mode) { minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -1824,7 +1824,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (use_path_minimizer) { + if (rec_mode) { oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { oversized_zipcodes_indexname = "Long Read Zipcodes"; From 3e32c11f09b3e92cfe4ed125e2090d71636da73d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 14:05:28 -0400 Subject: [PATCH 08/23] Implement part of defaulting the recombination-aware mapping to a sensible value. --- src/subcommand/giraffe_main.cpp | 79 ++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index d42e4c1039..c561f89464 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -729,7 +729,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map rec_mode = false; + // Whether to use recombination-aware mapping if the user did not set the option. + // Can't be a constant because it has to depend on the haplotype count in the index. + bool default_rec_mode = false; // Map algorithm names to rescue algorithms std::map rescue_algorithms = { @@ -1139,6 +1148,7 @@ int main_giraffe(int argc, char** argv) { {"report-name", required_argument, 0, OPT_REPORT_NAME}, {"parameter-preset", required_argument, 0, 'b'}, {"rec-mode", no_argument, 0, 'E'}, + {"no-rec-mode", no_argument, 0, OPT_NO_REC_MODE}, {"rescue-algorithm", required_argument, 0, 'A'}, {"fragment-mean", required_argument, 0, OPT_FRAGMENT_MEAN }, {"fragment-stdev", required_argument, 0, OPT_FRAGMENT_STDEV }, @@ -1351,6 +1361,9 @@ int main_giraffe(int argc, char** argv) { found->second.apply(*parser); } } + // TODO: Can we make the presets know if they are long read + // presets and not list their names here? What if someone + // manually turns on chaining? if (param_preset == "hifi" || param_preset == "r10") { map_long_reads = true; } @@ -1358,6 +1371,9 @@ int main_giraffe(int argc, char** argv) { case 'E': rec_mode = true; break; + case OPT_NO_REC_MODE: + rec_mode = false; + break; case 'A': { std::string algo_name = optarg; @@ -1534,10 +1550,6 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } - if (rec_mode && !map_long_reads) { - // We don't have file paths to load defined for recombination-aware short-read minimizers. - logger.error() << "Path minimizers cannot be used with short reads." << endl; - } // Use all the provided indexes for (auto& index : provided_indexes) { @@ -1549,24 +1561,41 @@ int main_giraffe(int argc, char** argv) { // Otherwise we'd always sample when we had reads available. bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); - if (haplotype_sampling) { - // Figure out how many samples we need in the haplotype-sampled graph. - // TODO: This assumes all reference samples are haploid. - int hap_count = reference_samples.size() + IndexingParameters::haplotype_sampling_diploid ? 2 : IndexingParameters::haplotype_sampling_num_haplotypes; - if (hap_count > MAX_PAYLOAD_PATHS) { - // We won't be able to store our sampled haplotypes in the index payload. - // We want to default recombination-awareness off. - - - if (rec_mode) { - // The user asked for recombination-aware chaining anyway. - logger.error() << "Cannot store " << hap_count << " distinct haplotypes in minimizer index payloads for --rec-mode" << std::endl; + if (map_long_reads) { + // We might want to default recombination-awareness to on. + if (haplotype_sampling) { + // Figure out how many samples we need in the haplotype-sampled graph. + // TODO: This assumes all reference samples are haploid. + int hap_count = reference_samples.size() + IndexingParameters::haplotype_sampling_diploid ? 2 : IndexingParameters::haplotype_sampling_num_haplotypes; + if (hap_count > MAX_PAYLOAD_PATHS) { + // We won't be able to store our sampled haplotypes in the index payload. + // We want to default recombination-awareness off. + default_rec_mode = false; + + + if (rec_mode.value_or(default_rec_mode)) { + // The user asked for recombination-aware chaining anyway. + logger.error() << "Cannot store " << hap_count << " distinct haplotypes in minimizer index payloads for --rec-mode" << std::endl; + } + } else { + // Our samples haplotypes fit in the recombination-aware chaining payload. + // We want to default recombination-awarenss on. + default_rec_mode = true; } } else { - // Our samples haplotypes fit in the recombination-aware chaining payload. - // We want to default recombination-awarenss on. + // We can't count the samples we're going to downsample to, so we'd + // like to count the haplotypes we will have available in the GBZ. + // But the GBZ won't be available to look at until later; we might + // be coming from a VCF or a graph that needs a path cover or + // something. TODO: Figure this out. + } } + + if (rec_mode.value_or(default_rec_mode) && !map_long_reads) { + // We don't have file paths to load defined for recombination-aware short-read minimizers. + logger.error() << "Recombination-aware mode cannot be used with short reads because no short read path-payload minimizer index type has been invented." << endl; + } string sample_scope; if (haplotype_sampling) { @@ -1672,7 +1701,7 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (rec_mode) { + if (rec_mode.value_or(default_rec_mode)) { indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); } else { @@ -1762,7 +1791,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (rec_mode) { + if (rec_mode.value_or(default_rec_mode)) { index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); } else { index_targets = VGIndexes::get_default_long_giraffe_indexes(); @@ -1801,7 +1830,7 @@ int main_giraffe(int argc, char** argv) { unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (rec_mode) { + if (rec_mode.value_or(default_rec_mode)) { minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -1824,7 +1853,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (rec_mode) { + if (rec_mode.value_or(default_rec_mode)) { oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { oversized_zipcodes_indexname = "Long Read Zipcodes"; From c187b468c460c9c86c169faf47270149b6c2ebf6 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 15:48:04 -0400 Subject: [PATCH 09/23] Rip out PathMinimizer distinction and decide if we want to be in recombination mode by default from payload type --- src/gbwtgraph_helper.cpp | 15 ++++- src/gbwtgraph_helper.hpp | 10 --- src/index_registry.cpp | 28 +------- src/index_registry.hpp | 2 - src/subcommand/giraffe_main.cpp | 102 +++++++++--------------------- src/subcommand/minimizer_main.cpp | 8 +-- 6 files changed, 44 insertions(+), 121 deletions(-) diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index 96ca15be2d..bf2c5189a7 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -499,12 +499,21 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( std::exit(EXIT_FAILURE); } - // Determine payload size and type. + // Count the distinct samples that we might use for the payload. + // TODO: We assume generic paths don't count for payloads. + std::unordered_set samples; + gbz.for_each_path_matching({PathSense::REFERENCE, PathSense::HAPLOTYPE}, {}, {}, [&](const path_handle_t& path) { + samples.insert(gbz.get_sample_name(path)); + }); + // Put the paths in if we think they will fit. + bool paths_in_payload = samples.size() <= MAX_PAYLOAD_PATHS; + + // Determine payload size and type code. size_t payload_size = 0; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_NONE; if (distance_index != nullptr) { payload_size = MinimizerIndexParameters::ZIPCODE_PAYLOAD_SIZE; - if (params.paths_in_payload) { + if (paths_in_payload) { payload_size++; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -549,7 +558,7 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( return reinterpret_cast(&MIPayload::NO_CODE); } }; - if (params.paths_in_payload) { + if (paths_in_payload) { gbwtgraph::index_haplotypes_with_paths(gbz, index, get_payload); } else { gbwtgraph::index_haplotypes(gbz, index, get_payload); diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp index 161bae25c2..d9ebb7923d 100644 --- a/src/gbwtgraph_helper.hpp +++ b/src/gbwtgraph_helper.hpp @@ -171,10 +171,6 @@ struct MinimizerIndexParameters { /// Whether to use syncmers instead of minimizers. bool use_syncmers = false; - /// Whether to include path information in the payload (for recombination-aware mapping). - /// Ignored if there is no zipcode payload. - bool paths_in_payload = false; - /// Whether to use weighted minimizers (cannot be used with syncmers). bool use_weighted_minimizers = false; @@ -209,12 +205,6 @@ struct MinimizerIndexParameters { return *this; } - /// Includes path information in the payload. - MinimizerIndexParameters& with_paths(bool paths_in_payload = true) { - this->paths_in_payload = paths_in_payload; - return *this; - } - /// Sets weighted minimizer parameters. MinimizerIndexParameters& weighted( bool use_weighted_minimizers, diff --git a/src/index_registry.cpp b/src/index_registry.cpp index cb09dbcbfa..04fc9b454a 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -519,8 +519,7 @@ construct_minimizers_impl( if (IndexingParameters::verbosity != IndexingParameters::None) { info(context) << "Constructing minimizer index and associated zipcodes." << endl; info(context) << " using parameters -k " << params.k << " -w " << params.w_or_s - << (params.use_weighted_minimizers ? " -W " : "") - << (params.paths_in_payload ? " with paths" : "") << endl; + << (params.use_weighted_minimizers ? " -W " : "") << endl; } assert(inputs.size() == 2); @@ -4573,25 +4572,11 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // TODO: We may not always want to store the minimizer index. Rebuilding the index may be // faster than loading it from a network drive. - registry.register_recipe( - {"Long Read PathMinimizers", "Long Read PathZipcodes"}, {"Giraffe Distance Index", "Giraffe GBZ"}, - [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { - MinimizerIndexParameters params; - params.minimizers(IndexingParameters::long_read_minimizer_k, IndexingParameters::long_read_minimizer_w) - .with_paths(true) - .weighted(IndexingParameters::long_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) - .kmer_counting(IndexingParameters::space_efficient_counting) - .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); - return construct_minimizers_impl(inputs, plan, constructing, params); - } - ); - registry.register_recipe( {"Short Read Minimizers", "Short Read Zipcodes"}, {"Giraffe Distance Index", "Giraffe GBZ"}, [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { MinimizerIndexParameters params; params.minimizers(IndexingParameters::short_read_minimizer_k, IndexingParameters::short_read_minimizer_w) - .with_paths(false) .weighted(IndexingParameters::short_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) .kmer_counting(IndexingParameters::space_efficient_counting) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); @@ -4603,7 +4588,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { MinimizerIndexParameters params; params.minimizers(IndexingParameters::long_read_minimizer_k, IndexingParameters::long_read_minimizer_w) - .with_paths(false) .weighted(IndexingParameters::long_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) .kmer_counting(IndexingParameters::space_efficient_counting) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); @@ -4662,16 +4646,6 @@ vector VGIndexes::get_default_long_giraffe_indexes() { return indexes; } -vector VGIndexes::get_default_long_path_giraffe_indexes() { - vector indexes{ - "Giraffe Distance Index", - "Giraffe GBZ", - "Long Read PathMinimizers", - "Long Read PathZipcodes", - }; - return indexes; -} - vector VGIndexes::get_default_haplotype_sampling_indexes() { vector indexes{ "GBZ", diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 0b8067d308..78ce755e8c 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -165,8 +165,6 @@ struct VGIndexes { static vector get_default_short_giraffe_indexes(); /// A list of the identifiers of the default indexes to run vg giraffe on long reads static vector get_default_long_giraffe_indexes(); - /// A list of the identifiers of the default indexes to run vg giraffe on long reads with path minimizers - static vector get_default_long_path_giraffe_indexes(); /// A list of the identifiers of the default indexes to run vg haplotypes static vector get_default_haplotype_sampling_indexes(); }; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c561f89464..8aa6c4acdd 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -899,10 +899,7 @@ int main_giraffe(int argc, char** argv) { // If true, use a minimizer index with paths in the payloads, and do // recombination-aware chaining. If false, use minimizer-only payloads and // don't do recombination-aware chaining. - std::optional rec_mode = false; - // Whether to use recombination-aware mapping if the user did not set the option. - // Can't be a constant because it has to depend on the haplotype count in the index. - bool default_rec_mode = false; + std::optional rec_mode; // Map algorithm names to rescue algorithms std::map rescue_algorithms = { @@ -1226,13 +1223,11 @@ int main_giraffe(int argc, char** argv) { case 'm': provided_indexes.emplace("Long Read Minimizers", optarg); provided_indexes.emplace("Short Read Minimizers", optarg); - provided_indexes.emplace("Long Read PathMinimizers", optarg); break; case 'z': provided_indexes.emplace("Long Read Zipcodes", optarg); provided_indexes.emplace("Short Read Zipcodes", optarg); - provided_indexes.emplace("Long Read PathZipcodes", optarg); break; case 'd': provided_indexes.emplace("Giraffe Distance Index", optarg); @@ -1550,6 +1545,11 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } + if (rec_mode.has_value() && rec_mode.value() && !map_long_reads) { + // The user has specifically asked fro recombination-aware mapping on short reads, which isn't available. + // TODO: Get a better idea of if reads are long. + logger.error() << "Recombination-aware mode cannot be used with short reads yet." << endl; + } // Use all the provided indexes for (auto& index : provided_indexes) { @@ -1561,42 +1561,6 @@ int main_giraffe(int argc, char** argv) { // Otherwise we'd always sample when we had reads available. bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); - if (map_long_reads) { - // We might want to default recombination-awareness to on. - if (haplotype_sampling) { - // Figure out how many samples we need in the haplotype-sampled graph. - // TODO: This assumes all reference samples are haploid. - int hap_count = reference_samples.size() + IndexingParameters::haplotype_sampling_diploid ? 2 : IndexingParameters::haplotype_sampling_num_haplotypes; - if (hap_count > MAX_PAYLOAD_PATHS) { - // We won't be able to store our sampled haplotypes in the index payload. - // We want to default recombination-awareness off. - default_rec_mode = false; - - - if (rec_mode.value_or(default_rec_mode)) { - // The user asked for recombination-aware chaining anyway. - logger.error() << "Cannot store " << hap_count << " distinct haplotypes in minimizer index payloads for --rec-mode" << std::endl; - } - } else { - // Our samples haplotypes fit in the recombination-aware chaining payload. - // We want to default recombination-awarenss on. - default_rec_mode = true; - } - } else { - // We can't count the samples we're going to downsample to, so we'd - // like to count the haplotypes we will have available in the GBZ. - // But the GBZ won't be available to look at until later; we might - // be coming from a VCF or a graph that needs a path cover or - // something. TODO: Figure this out. - - } - } - - if (rec_mode.value_or(default_rec_mode) && !map_long_reads) { - // We don't have file paths to load defined for recombination-aware short-read minimizers. - logger.error() << "Recombination-aware mode cannot be used with short reads because no short read path-payload minimizer index type has been invented." << endl; - } - string sample_scope; if (haplotype_sampling) { @@ -1701,13 +1665,8 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (rec_mode.value_or(default_rec_mode)) { - indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); - indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); - } else { - indexes_and_extensions.emplace(std::string("Long Read Minimizers"), std::vector({"longread.withzip.min","withzip.min", "min"})); - indexes_and_extensions.emplace(std::string("Long Read Zipcodes"), std::vector({"longread.zipcodes", "zipcodes"})); - } + indexes_and_extensions.emplace(std::string("Long Read Minimizers"), std::vector({"longread.path.min", "longread.withzip.min", "path.min", "withzip.min", "min"})); + indexes_and_extensions.emplace(std::string("Long Read Zipcodes"), std::vector({"longread.path.zipcodes", "longread.zipcodes", "path.zipcodes", "zipcodes"})); } else { indexes_and_extensions.emplace(std::string("Short Read Minimizers"), std::vector({"shortread.withzip.min","withzip.min", "min"})); indexes_and_extensions.emplace(std::string("Short Read Zipcodes"), std::vector({"shortread.zipcodes", "zipcodes"})); @@ -1765,14 +1724,12 @@ int main_giraffe(int argc, char** argv) { } //If we're making new zipcodes, we should rebuild the minimizers too - if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { + if (!indexes_and_extensions.count(std::string("Long Read Minimizers")) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; registry.reset(std::string("Long Read Minimizers")); - registry.reset(std::string("Long Read PathMinimizers")); - } else if ((indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && !indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { + } else if (indexes_and_extensions.count(std::string("Long Read Minimizers")) && !indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding zipcodes index to match new minimizers" << endl; registry.reset(std::string("Long Read Zipcodes")); - registry.reset(std::string("Long Read PathZipcodes")); } else if (!indexes_and_extensions.count(std::string("Short Read Minimizers")) && indexes_and_extensions.count(std::string("Short Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; registry.reset(std::string("Short Read Minimizers")); @@ -1791,11 +1748,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (rec_mode.value_or(default_rec_mode)) { - index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); - } else { - index_targets = VGIndexes::get_default_long_giraffe_indexes(); - } + index_targets = VGIndexes::get_default_long_giraffe_indexes(); } #ifdef debug for (auto& needed : index_targets) { @@ -1828,15 +1781,9 @@ int main_giraffe(int argc, char** argv) { } IndexName minimizer_indexname; unique_ptr minimizer_index; - MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (rec_mode.value_or(default_rec_mode)) { - minimizer_indexname = "Long Read PathMinimizers"; - payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; - } else { - // Use the long read minimizers - minimizer_indexname = "Long Read Minimizers"; - } + // Use the long read minimizers + minimizer_indexname = "Long Read Minimizers"; } else { minimizer_indexname = "Short Read Minimizers"; } @@ -1844,8 +1791,23 @@ int main_giraffe(int argc, char** argv) { logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " which depends on it" << std::endl; } minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); - require_payload(*minimizer_index, payload_type); + // Decide if we want to do recombination-aware mapping by default, based on whether we can. + + // Whether to use recombination-aware mapping if the user did not set the option. + // Can't be a constant because it has to depend on the haplotype count in the index. + bool default_rec_mode = has_payload(*minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) && map_long_reads; + + std::vector allowed_payloads; + allowed_payloads.push_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS); + if (!rec_mode.value_or(default_rec_mode)) { + // If we're not doing recombination-aware mapping, we can also accept a + // payload without path info, as long as it still has zipcodes. + allowed_payloads.puch_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES); + } + // Make sure we have a usable minimizer index payload. + require_payload(*minimizer_index, allowed_payloads); + // Grab the zipcodes if (show_progress) { logger.info() << "Loading Zipcodes" << endl; @@ -1853,11 +1815,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (rec_mode.value_or(default_rec_mode)) { - oversized_zipcodes_indexname = "Long Read PathZipcodes"; - } else { - oversized_zipcodes_indexname = "Long Read Zipcodes"; - } + oversized_zipcodes_indexname = "Long Read Zipcodes"; } else { oversized_zipcodes_indexname = "Short Read Zipcodes"; } diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 379e240463..7ff38656f8 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -159,7 +159,6 @@ void help_minimizer(char** argv) { std::cerr << "Other options:" << std::endl; std::cerr << " -z, --zipcode-name FILE store the distances that are too big in a file" << std::endl; std::cerr << " if no -z, some distances may be discarded" << std::endl; - std::cerr << " -E, --rec-mode build recombination-aware MinimizerIndex" << std::endl; std::cerr << " -p, --progress show progress information" << std::endl; std::cerr << " -t, --threads N use N threads for index construction " << "[" << get_default_threads() << "]" << std::endl; @@ -188,7 +187,6 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L while (true) { static struct option long_options[] = { - { "rec-mode", no_argument, 0, 'E' }, { "distance-index", required_argument, 0, 'd' }, { "output-name", required_argument, 0, 'o' }, { "kmer-length", required_argument, 0, 'k' }, @@ -210,15 +208,11 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L }; int option_index = 0; - c = getopt_long(argc, argv, "Ed:o:k:w:cs:Wz:pt:h?", long_options, &option_index); + c = getopt_long(argc, argv, "d:o:k:w:cs:Wz:pt:h?", long_options, &option_index); if (c == -1) { break; } // End of options. switch (c) { - case 'E': - this->params.paths_in_payload = true; - logger.warn() << "--rec-mode is still under development" << std::endl; - break; case 'd': this->distance_name = optarg; break; From d9091afe706f545462a79ba886c0a1c19a47a846 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 17:00:59 -0400 Subject: [PATCH 10/23] Ignore path payloads if we're not meant to be using them --- src/minimizer_mapper.cpp | 4 ++-- src/minimizer_mapper.hpp | 8 ++++++-- src/subcommand/giraffe_main.cpp | 11 +++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index eea5ac6918..aa7a4d1618 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -61,8 +61,8 @@ MinimizerMapper::MinimizerMapper(const gbwtgraph::GBWTGraph& graph, path_graph(path_graph), minimizer_index(minimizer_index), k(minimizer_index.k()), w(minimizer_index.w()), - payload_with_paths(has_payload(minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS)), uses_syncmers(minimizer_index.uses_syncmers()), + use_payload_paths(has_payload(minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS)), distance_index(distance_index), zipcodes(zipcodes), clusterer(distance_index, &graph), @@ -4461,7 +4461,7 @@ std::vector MinimizerMapper::find_seeds(const std::vector // Handle the payload. seeds.back().zipcode.fill_in_zipcode(occ.second, this->zipcodes, *(this->distance_index), hit); - if (this->payload_with_paths) { + if (this->use_payload_paths) { seeds.back().paths = occ.second[MinimizerIndexParameters::ZIPCODE_PAYLOAD_SIZE]; } } diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index cddd77d105..6a5052a9ac 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -514,6 +514,11 @@ class MinimizerMapper : public AlignerClient { string sample_name; /// Apply this read group name string read_group; + + /// Should we use path information from the minimizer index payloads? + /// By default we fill this in based on availability in the index, but you + /// can clear this if it is set to turn off recombination-aware mapping. + bool use_payload_paths; /// Have we complained about hitting the size limit for rescue? atomic_flag warned_about_rescue_size = ATOMIC_FLAG_INIT; @@ -653,9 +658,8 @@ class MinimizerMapper : public AlignerClient { // caching common information about the minimizer index int32_t k; int32_t w; - bool payload_with_paths; // Does the payload for minimizer hits include path information in addition to a zipcode? bool uses_syncmers; // TODO: We could discard the syncmer support. - + SnarlDistanceIndex* distance_index; const ZipCodeCollection* zipcodes; /// This is our primary graph. diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 8aa6c4acdd..348717d20e 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1985,6 +1985,14 @@ int main_giraffe(int argc, char** argv) { } } }; + auto report_paired_flag = [&](const std::string& name, bool value) { + if (value) { + params_json << ",\"" << name << "\":" << (value ? "true" : "false"); + if (show_progress) { + cerr << "--" << (value ? "" : "no-") << name << endl; + } + } + }; auto report_number = [&](const std::string& name, size_t value) { params_json << ",\"" << name << "\":" << value; if (show_progress) { @@ -2023,6 +2031,9 @@ int main_giraffe(int argc, char** argv) { report_string("rescue-algorithm", algorithm_names[rescue_algorithm]); } minimizer_mapper.rescue_algorithm = rescue_algorithm; + report_paired_flag("rec-mode", rec_mode.value_or(default_rec_mode)); + minimizer_mapper.use_payload_paths &= rec_mode.value_or(default_rec_mode); + params_json << "}" << std::endl; From d8e9be8a4409de462a61a925b7d7142205b932b5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 17:10:53 -0400 Subject: [PATCH 11/23] Set up recombination penalty values for presets and reorder options --- src/minimizer_mapper.hpp | 12 ++++++------ src/subcommand/giraffe_main.cpp | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 6a5052a9ac..f4387474c3 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -291,14 +291,11 @@ class MinimizerMapper : public AlignerClient { /// at chaining? static constexpr double default_gap_scale = 1.0; double gap_scale = default_gap_scale; - /// Recombination penalty. This is added to the cost of a transition if there are no shared haplotypes. + /// Recombination penalty for chaining. This is added to the cost of a transition if there are no shared haplotypes. static constexpr int default_rec_penalty = 0; int rec_penalty = default_rec_penalty; - /// Recombination penalty for alignment scoring. This is deducted from the score of an alignment for each required recombination. -1 means use rec_penalty. - static constexpr int default_rec_penalty_aln = -1; - int rec_penalty_aln = default_rec_penalty_aln; - /// Recombination-aware chaining bonus for avoiding losing haplotypes. Not actually tracked in chaining DP score. -1 means use rec_penalty. - static constexpr int default_rec_consistency_bonus = -1; + /// Recombination-aware chaining bonus for avoiding losing haplotypes. Not actually tracked in chaining DP score. + static constexpr int default_rec_consistency_bonus = 0; int rec_consistency_bonus = default_rec_consistency_bonus; // How many points should we treat a non-gap connection base as producing, at chaining? static constexpr double default_points_per_possible_match = 0; @@ -384,6 +381,9 @@ class MinimizerMapper : public AlignerClient { static constexpr int default_wfa_max_distance = WFAExtender::ErrorModel::default_distance().max; int wfa_max_distance = default_wfa_max_distance; + /// Recombination penalty for alignment scoring. This is deducted from the score of an alignment for each required recombination. + static constexpr int default_rec_penalty_aln = 0; + int rec_penalty_aln = default_rec_penalty_aln; /// How much should candidate alignment scores be penalized for softclipped bases? static constexpr double default_softclip_penalty = 0.0; double softclip_penalty = default_softclip_penalty; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 348717d20e..de5429a487 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -453,21 +453,14 @@ static std::unique_ptr get_options() { "rec-penalty", &MinimizerMapper::rec_penalty, MinimizerMapper::default_rec_penalty, - "penalty for a recombination, requires -E (ALPHA)", - int_is_nonnegative - ); - chaining_opts.add_range( - "rec-penalty-aln", - &MinimizerMapper::rec_penalty_aln, - MinimizerMapper::default_rec_penalty_aln, - "penalty for a recombination for final alignment scoring, -1 for --rec-penalty, requires -E (ALPHA)", + "penalty for a recombination in recombination-aware mode", int_is_nonnegative ); chaining_opts.add_range( "rec-consistency-bonus", &MinimizerMapper::rec_consistency_bonus, MinimizerMapper::default_rec_consistency_bonus, - "untracked bonus for chains staying consistent with haplotypes to avoid recombinations, -1 for --rec-penalty, requires -E (ALPHA)", + "untracked bonus for chains staying consistent with haplotypes to avoid recombinations in recombination-aware mode", int_is_nonnegative ); chaining_opts.add_range( @@ -604,6 +597,13 @@ static std::unique_ptr get_options() { MinimizerMapper::default_wfa_max_distance, "band distance to allow in the longest WFA connection or tail" ); + chaining_opts.add_range( + "rec-penalty-aln", + &MinimizerMapper::rec_penalty_aln, + MinimizerMapper::default_rec_penalty_aln, + "penalty per recombination for final alignment scoring in recombination-aware mode", + int_is_nonnegative + ); chaining_opts.add_range( "softclip-penalty", &MinimizerMapper::softclip_penalty, @@ -964,6 +964,8 @@ int main_giraffe(int argc, char** argv) { .add_entry("item-bonus", 2) .add_entry("item-scale", 1.0) .add_entry("gap-scale", 0.27579) + .add_entry("rec-penalty", 2) + .add_entry("rec-consistency-bonus", 12) .add_entry("chain-score-threshold", 234.0) .add_entry("min-chains", 2) .add_entry("min-chain-score-per-base", 0.24) @@ -983,6 +985,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("wfa-max-mismatches", 2) .add_entry("wfa-max-mismatches-per-base", 0.05) .add_entry("wfa-max-max-mismatches", 15) + .add_entry("rec-penalty-aln", 28) .add_entry("prune-low-cplx", true); Preset r10_base; @@ -1020,6 +1023,8 @@ int main_giraffe(int argc, char** argv) { .add_entry("item-bonus", 20) .add_entry("item-scale", 1.0) .add_entry("gap-scale", 0.06759721757973396) + .add_entry("rec-penalty", 2) + .add_entry("rec-consistency-bonus", 12) .add_entry("chain-score-threshold", 160.0) .add_entry("min-chains", 2) .add_entry("max-chains-per-tree", 3) @@ -1039,6 +1044,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("wfa-max-mismatches", 2) .add_entry("wfa-max-mismatches-per-base", 0.05) .add_entry("wfa-max-max-mismatches", 15) + .add_entry("rec-penalty-aln", 28) .add_entry("prune-low-cplx", true); presets.emplace("r10", r10_base); From f09b52f47683272f9be5751ead650c9b59daca76 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 17:23:57 -0400 Subject: [PATCH 12/23] Remove build errors I added --- src/gbwtgraph_helper.cpp | 10 ++++------ src/gbwtgraph_helper.hpp | 6 +++++- src/index_registry.hpp | 4 ---- src/subcommand/giraffe_main.cpp | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index bf2c5189a7..24746a6130 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -500,13 +500,11 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } // Count the distinct samples that we might use for the payload. - // TODO: We assume generic paths don't count for payloads. - std::unordered_set samples; - gbz.for_each_path_matching({PathSense::REFERENCE, PathSense::HAPLOTYPE}, {}, {}, [&](const path_handle_t& path) { - samples.insert(gbz.get_sample_name(path)); - }); + // TODO: We're counting the magic generic samples here. + size_t sample_count = gbz.graph.index->metadata.sample_names.size(); + // Put the paths in if we think they will fit. - bool paths_in_payload = samples.size() <= MAX_PAYLOAD_PATHS; + bool paths_in_payload = sample_count <= MinimizerIndexParameters::MAX_PAYLOAD_PATHS; // Determine payload size and type code. size_t payload_size = 0; diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp index d9ebb7923d..52594d7a81 100644 --- a/src/gbwtgraph_helper.hpp +++ b/src/gbwtgraph_helper.hpp @@ -148,7 +148,11 @@ struct MinimizerIndexParameters { constexpr static size_t HASH_TABLE_MAX_WIDTH = 36; /// Number of words used for a zipcode payload. - constexpr static size_t ZIPCODE_PAYLOAD_SIZE = sizeof(ZipCode::payload_type) / sizeof(gbwtgraph::KmerEncoding::code_type); + constexpr static size_t ZIPCODE_PAYLOAD_SIZE = sizeof(ZipCode::payload_type) / sizeof(gbwtgraph::KmerEncoding::code_type); + + /// How many hapolotypoes should we expect to be able to store on a minimizer index path payload? + /// TODO: Move this to the minimizer index. + static constexpr int MAX_PAYLOAD_PATHS = 60; enum PayloadType { /// No payload. diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 78ce755e8c..45180237c9 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -145,10 +145,6 @@ struct IndexingParameters { static int haplotype_sampling_minimizer_w; }; -/// How many hapolotypoes should we expect to be able to store on a minimizer index path payload? -/// TODO: Move this to the minimizer index. -static constexpr int MAX_PAYLOAD_PATHS = 60; - /** * A struct namespace for standard inputs */ diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index de5429a487..767cc7b40d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1809,7 +1809,7 @@ int main_giraffe(int argc, char** argv) { if (!rec_mode.value_or(default_rec_mode)) { // If we're not doing recombination-aware mapping, we can also accept a // payload without path info, as long as it still has zipcodes. - allowed_payloads.puch_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES); + allowed_payloads.push_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES); } // Make sure we have a usable minimizer index payload. require_payload(*minimizer_index, allowed_payloads); From 09f2ad4e626c70293649452b692edfc376ab16fc Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 14 May 2026 18:35:14 -0400 Subject: [PATCH 13/23] Make the old vg minimizer --rec-mode now enforce that path payloads will be there --- src/gbwtgraph_helper.cpp | 29 ++++++++++++++++++----------- src/gbwtgraph_helper.hpp | 3 ++- src/subcommand/minimizer_main.cpp | 13 +++++++++++-- test/t/46_vg_minimizer.t | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index 24746a6130..2397a28921 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -487,16 +487,18 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( const gbwtgraph::GBZ& gbz, const SnarlDistanceIndex* distance_index, ZipCodeCollection* oversized_zipcodes, - const MinimizerIndexParameters& params + const MinimizerIndexParameters& params, + bool require_path_payloads ) { double start = gbwt::readTimer(); + Logger logger("build_minimizer_index"); + // Minimal validation to avoid inconsistent parameters that would // otherwise require manual handling. std::string err = params.validate(); if (!err.empty()) { - std::cerr << "error: [build_minimizer_index()] " << err << std::endl; - std::exit(EXIT_FAILURE); + logger.error() << err << std::endl; } // Count the distinct samples that we might use for the payload. @@ -518,6 +520,11 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; } } + if (require_path_payloads && payload_type != MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { + // We want to bail out before doing all the indexing if we aren't going to have path info. + logger.error() << "Cannot build minimizer index supporting recombination-aware mapping (GBZ contains " + << sample_count << "/" << MinimizerIndexParameters::MAX_PAYLOAD_PATHS << " samples)" << std::endl; + } std::string payload_str = MinimizerIndexParameters::payload_str(payload_type); // Create an empty minimizer index. @@ -530,13 +537,13 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( // Build the index. if (params.progress) { - std::cerr << "Building MinimizerIndex with k = " << index.k(); + auto s = logger.info() << "Building MinimizerIndex with k = " << index.k(); if (index.uses_syncmers()) { - std::cerr << ", s = " << index.s(); + s << ", s = " << index.s(); } else { - std::cerr << ", w = " << index.w(); + s << ", w = " << index.w(); } - std::cerr << ", payload = " << payload_str << std::endl; + s << ", payload = " << payload_str << std::endl; } if (distance_index == nullptr) { @@ -565,11 +572,11 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( // Index statistics. if (params.progress) { - std::cerr << index.size() << " keys (" << index.unique_keys() << " unique)" << std::endl; - std::cerr << "Minimizer occurrences: " << index.number_of_values() << std::endl; - std::cerr << "Load factor: " << index.load_factor() << std::endl; + logger.info() << index.size() << " keys (" << index.unique_keys() << " unique)" << std::endl; + logger.info() << "Minimizer occurrences: " << index.number_of_values() << std::endl; + logger.info() << "Load factor: " << index.load_factor() << std::endl; double seconds = gbwt::readTimer() - start; - std::cerr << "Construction time: " << seconds << " seconds" << std::endl; + logger.info() << "Construction time: " << seconds << " seconds" << std::endl; } return index; diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp index 52594d7a81..30ff179029 100644 --- a/src/gbwtgraph_helper.hpp +++ b/src/gbwtgraph_helper.hpp @@ -250,7 +250,8 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( const gbwtgraph::GBZ& gbz, const SnarlDistanceIndex* distance_index, ZipCodeCollection* oversized_zipcodes, - const MinimizerIndexParameters& params + const MinimizerIndexParameters& params, + bool require_path_payloads = false ); /// Checks that the minimizer index has the expected payload type. diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 7ff38656f8..074867685d 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -60,6 +60,9 @@ struct MinimizerConfig { // Construction parameters. MinimizerIndexParameters params; bool require_distance_index = true; + /// Should we enforce that the resulting minimizer index has path + /// payloads, for recombination-aware mapping? + bool require_path_payloads = false; // Other options. // Note that params also has a 'progress' field. @@ -99,7 +102,8 @@ int main_minimizer(int argc, char** argv) { gbz, distance_index.get(), &oversized_zipcodes, - config.params + config.params, + config.require_path_payloads ); // Close the distance index so it can't seem to be modified after the files @@ -165,6 +169,7 @@ void help_minimizer(char** argv) { std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl; std::cerr << " --no-dist build the index without distance index annotations" << std::endl; std::cerr << " (not recommended)" << std::endl; + std::cerr << " -E, --rec-mode assert MinimizerIndex will support recombination-aware mapping" << std::endl; std::cerr << " -h, --help print this help message to stderr and exit" << std::endl; std::cerr << std::endl; } @@ -203,12 +208,13 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L { "progress", no_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, { "no-dist", no_argument, 0, OPT_NO_DIST }, + { "rec-mode", no_argument, 0, 'E' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; - c = getopt_long(argc, argv, "d:o:k:w:cs:Wz:pt:h?", long_options, &option_index); + c = getopt_long(argc, argv, "d:o:k:w:cs:Wz:pt:Eh?", long_options, &option_index); if (c == -1) { break; } // End of options. switch (c) @@ -265,6 +271,9 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L case OPT_NO_DIST: this->require_distance_index = false; break; + case 'E': + this->require_path_payloads = true; + break; case 'h': case '?': diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index a3a2d44160..df1d81669a 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -45,7 +45,7 @@ is $? 0 "construction with zipcode payload" # Store zipcode payload and path information in the index # Construction will not be deterministic because the snarls are not deterministic vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x.gbz -is $? 0 "construction with zipcode payload and path information" +is $? 0 "construction with zipcode payload and mandatory path information" rm -f x.vg x.dist x.mi x.zipcodes x.gbz From 4ea138cd6777ed3e144ce80f6074c1203ad45939 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 May 2026 17:16:25 -0400 Subject: [PATCH 14/23] Set optimized R10 recombination parameters --- src/subcommand/giraffe_main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 767cc7b40d..e8708cdb13 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1024,7 +1024,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("item-scale", 1.0) .add_entry("gap-scale", 0.06759721757973396) .add_entry("rec-penalty", 2) - .add_entry("rec-consistency-bonus", 12) + .add_entry("rec-consistency-bonus", 13) .add_entry("chain-score-threshold", 160.0) .add_entry("min-chains", 2) .add_entry("max-chains-per-tree", 3) @@ -1044,7 +1044,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("wfa-max-mismatches", 2) .add_entry("wfa-max-mismatches-per-base", 0.05) .add_entry("wfa-max-max-mismatches", 15) - .add_entry("rec-penalty-aln", 28) + .add_entry("rec-penalty-aln", 32) .add_entry("prune-low-cplx", true); presets.emplace("r10", r10_base); From fe16fd8800bfba12e937b359c4823c507905b5e1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 May 2026 17:17:00 -0400 Subject: [PATCH 15/23] Set optimized R10 min-chain-score-per-base --- src/subcommand/giraffe_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index e8708cdb13..7f3e045e94 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1028,7 +1028,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("chain-score-threshold", 160.0) .add_entry("min-chains", 2) .add_entry("max-chains-per-tree", 3) - .add_entry("min-chain-score-per-base", 0.052) + .add_entry("min-chain-score-per-base", 0.0010573511598202133) .add_entry("max-min-chain-score", 1900.0) .add_entry("min-indel-avoid-bases", 50) .add_entry("max-skipped-bases", 1000) From fbde74259de2cc3e63fdf8b294fb4d6896619296 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 11:03:13 -0400 Subject: [PATCH 16/23] Put min chain score per base back up and write down why we shouldn't be tempted to lower it --- src/subcommand/giraffe_main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 7f3e045e94..71189581e2 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1028,7 +1028,10 @@ int main_giraffe(int argc, char** argv) { .add_entry("chain-score-threshold", 160.0) .add_entry("min-chains", 2) .add_entry("max-chains-per-tree", 3) - .add_entry("min-chain-score-per-base", 0.0010573511598202133) + // Lowering this can reduce wrong reads in mapping experiments, but + // seems to *increase* miscalls; see + // https://ucsc-gi.slack.com/archives/CJ2EHEH1A/p1779202719223779?thread_ts=1779127572.303779&cid=CJ2EHEH1A + .add_entry("min-chain-score-per-base", 0.052) .add_entry("max-min-chain-score", 1900.0) .add_entry("min-indel-avoid-bases", 50) .add_entry("max-skipped-bases", 1000) From 1b7a1bf2a72b32904a8ed7cd3c9f79800d13601a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 14:23:10 -0400 Subject: [PATCH 17/23] Stop mapping reads with negative scores and move responsibility for unaligned mapping generation --- src/minimizer_mapper.hpp | 11 ++- src/minimizer_mapper_from_chains.cpp | 135 +++++++++++++++------------ 2 files changed, 85 insertions(+), 61 deletions(-) diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index f4387474c3..4a618a8ad2 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -916,14 +916,21 @@ class MinimizerMapper : public AlignerClient { const std::vector>& minimizer_kept_chain_count, vector& alignments, vector& multiplicity_by_alignment, vector& alignments_to_source, - SmallBitset& minimizer_explored, aligner_stats_t& stats, bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const; + SmallBitset& minimizer_explored, aligner_stats_t& stats, LazyRNG& rng, Funnel& funnel) const; + /** + * Select the max_multimaps best alignments from alignments into mappings. + * + * Skips alignments that overlap too much with previous alignments. + * + * If no alignments have a positive score, responsible for creating an unmapped mapping. + */ void pick_mappings_from_alignments(Alignment& aln, const std::vector& alignments, const std::vector& multiplicity_by_alignment, const std::vector& alignments_to_source, const std::vector& chain_score_estimates, std::vector& mappings, std::vector& scores, std::vector& multiplicity_by_mapping, - bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const; + LazyRNG& rng, Funnel& funnel) const; diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 5a9ec5e3a0..e1bfd77441 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -835,8 +835,6 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // Track statistics about how many bases were aligned by diffrent methods, and how much time was used. aligner_stats_t stats; - bool funnel_depleted = false; - // This maps from alignment index back to chain index, for // tracing back to minimizers for MAPQ. Can hold // numeric_limits::max() for an unaligned alignment. @@ -846,7 +844,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (alignments.size() == 0) { do_alignment_on_chains(aln, seeds, minimizers, seed_anchors, chains, chain_source_tree, multiplicity_by_chain, chain_score_estimates, minimizer_kept_chain_count, alignments, multiplicity_by_alignment, - alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); + alignments_to_source, minimizer_explored, stats, rng, funnel); } for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { @@ -903,9 +901,10 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { vector scores; //The multiplicities of mappings vector multiplicity_by_mapping; - + + // Collect the chosen mappings, or create the unmapped-read mapping. pick_mappings_from_alignments(aln, alignments, multiplicity_by_alignment, alignments_to_source, chain_score_estimates, - mappings, scores, multiplicity_by_mapping, funnel_depleted, rng, funnel); + mappings, scores, multiplicity_by_mapping, rng, funnel); if (track_provenance) { funnel.substage("mapq"); @@ -1884,7 +1883,6 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector& alignments, vector& multiplicity_by_alignment, vector& alignments_to_source, SmallBitset& minimizer_explored, aligner_stats_t& stats, - bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const { if (track_provenance) { @@ -2204,41 +2202,27 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector::max()); - multiplicity_by_alignment.emplace_back(0); - // Stop telling the funnel about filters and items. - funnel_depleted = true; - } else { - //chain_count_by_alignment is currently the number of better or equal chains that were used - // We really want the number of chains not including the ones that represent the same mapping - // TODO: This isn't very efficient - for (size_t i = 0 ; i < chain_count_by_alignment.size() ; ++i) { - size_t chain_i = alignments_to_source[i]; - for (size_t j = 0 ; j < chain_count_by_alignment.size() ; ++j) { - size_t chain_j = alignments_to_source[j]; - if (i != j && - chain_score_estimates[chain_i] >= chain_score_estimates[chain_j] && - chain_ranges_are_equivalent(seeds[chains[chain_i].front()], - seeds[chains[chain_i].back()], - seeds[chains[chain_j].front()], - seeds[chains[chain_j].back()])) { - --chain_count_by_alignment[i]; - } + //chain_count_by_alignment is currently the number of better or equal chains that were used + // We really want the number of chains not including the ones that represent the same mapping + // TODO: This isn't very efficient + for (size_t i = 0 ; i < chain_count_by_alignment.size() ; ++i) { + size_t chain_i = alignments_to_source[i]; + for (size_t j = 0 ; j < chain_count_by_alignment.size() ; ++j) { + size_t chain_j = alignments_to_source[j]; + if (i != j && + chain_score_estimates[chain_i] >= chain_score_estimates[chain_j] && + chain_ranges_are_equivalent(seeds[chains[chain_i].front()], + seeds[chains[chain_i].back()], + seeds[chains[chain_j].front()], + seeds[chains[chain_j].back()])) { + --chain_count_by_alignment[i]; } } - for (size_t i = 0 ; i < multiplicity_by_alignment.size() ; ++i) { - multiplicity_by_alignment[i] += (chain_count_by_alignment[i] >= alignments.size() - ? ((double)chain_count_by_alignment[i] - (double) alignments.size()) - : 0.0); - } + } + for (size_t i = 0 ; i < multiplicity_by_alignment.size() ; ++i) { + multiplicity_by_alignment[i] += (chain_count_by_alignment[i] >= alignments.size() + ? ((double)chain_count_by_alignment[i] - (double) alignments.size()) + : 0.0); } } @@ -2249,7 +2233,7 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v std::vector& mappings, std::vector& scores, std::vector& multiplicity_by_mapping, - bool& funnel_depleted, LazyRNG& rng, + LazyRNG& rng, Funnel& funnel) const { // Look for duplicate alignments by using this collection of node IDs and orientations @@ -2294,14 +2278,13 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v if (this->sort_by_chain_score) { // Use the chain's score to rank the alignments size_t chain_number = alignments_to_source.at(alignment_number); - if (chain_number == std::numeric_limits::max()) { - // This is an unaligned alignment, score 0. - return 0; - } + // Unaligned alignments from no chain are no longer allowed in; + // this function is supposed to make them. So all alignments + // reference into chain_score_estimates. return chain_score_estimates.at(chain_number); } else { // Use base-level alignment score to rank alignments - // Tiebreak by identity (always > 0 and <= 1) + // Tiebreak by identity (which is always 0 to 1) return alignments.at(alignment_number).score() + identity(alignments.at(alignment_number).path()); } }; @@ -2316,39 +2299,51 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // This alignment makes it // Called in score order + // Filter to alignments with strictly positive scores + if (alignments[alignment_num].score() <= 0) { + if (track_provenance) { + funnel.fail("nonzero-score", alignment_num); + } + return false; + } else { + if (track_provenance) { + funnel.pass("nonzero-score", alignment_num); + } + } + // Do the unique node fraction filter double unique_node_fraction = get_fraction_unique(alignment_num); if (unique_node_fraction < min_unique_node_fraction) { // If not enough of the alignment is from unique nodes, drop it. - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " rejected because only " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } } return false; } else { - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.pass("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " accepted because " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } } } - if (track_provenance && !funnel_depleted) { + if (track_provenance) { // Tell the funnel funnel.pass("max-multimaps", alignment_num); } @@ -2364,7 +2359,7 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // Remember the multiplicity multiplicity_by_mapping.emplace_back(multiplicity_by_alignment[alignment_num]); - if (track_provenance && !funnel_depleted) { + if (track_provenance) { // Tell the funnel funnel.project(alignment_num); funnel.score(funnel.latest(), scores.back()); @@ -2379,14 +2374,14 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v double unique_node_fraction = get_fraction_unique(alignment_num); if (unique_node_fraction < min_unique_node_fraction) { // If not enough of the alignment is from unique nodes, drop it. - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " rejected because only " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } @@ -2394,14 +2389,14 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // If we fail the unique node fraction filter, we won't count as a secondary for MAPQ return; } else { - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.pass("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " accepted because " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } @@ -2412,14 +2407,36 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v scores.emplace_back(alignments[alignment_num].score()); multiplicity_by_mapping.emplace_back(multiplicity_by_alignment[alignment_num]); - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("max-multimaps", alignment_num); } }, [&](size_t alignment_num) { - // This alignment does not have a sufficiently good score - // Score threshold is 0; this should never happen - crash_unless(false); + // This alignment does not have a sufficiently good score. + // It may have been penalized into negative score. + + if (track_provenance) { + // Call this a fail of the nonzero score filter, even though that + // really filters on alignment score and not sorting score, which + // can be different. + // + // TODO: Remove feature to sort by chain score? + funnel.fail("nonzero-score", alignment_num); + } }); + + if (mappings.empty()) { + // We didn't find any mappings, so make an unmapped one. + + scores.emplace_back(0); + mappings.emplace_back(aln); + multiplicity_by_mapping.emplace_back(0); + + if (track_provenance) { + // Tell the funnel + funnel.introduce(); + funnel.score(funnel.latest(), scores.back()); + } + } } double MinimizerMapper::get_read_coverage( From 41f7494d27f487eeccd01814b203018f42b38f22 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 14:24:11 -0400 Subject: [PATCH 18/23] Add test file --- test/reads/small.recombined.fq | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 test/reads/small.recombined.fq diff --git a/test/reads/small.recombined.fq b/test/reads/small.recombined.fq new file mode 100644 index 0000000000..e724391d97 --- /dev/null +++ b/test/reads/small.recombined.fq @@ -0,0 +1,4 @@ +@recombined +ATACCAACTCTCTGGGTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTCTTCCTTTGATTTATTTGAAGTAACGTTTGACAATCTATCACCAGGGGTAATGTGGGGAAATGGAAAGAATACAAAGATCTGGAGCCAGACAAATCTGGGTCCAAGTCCTCACTTTGCCGCAGA ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC From 6d989b2d7decc521a34197580e6b3003fd3c3411 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 14:20:06 -0400 Subject: [PATCH 19/23] Set up testing for rec-aware mapping by default and remove test for rejecting path-payload minimizers --- test/t/46_vg_minimizer.t | 7 +++++-- test/t/50_vg_giraffe.t | 20 ++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index df1d81669a..067e7a8d10 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 11 +plan tests 12 # Indexing a single graph @@ -42,10 +42,13 @@ is $(md5sum x.mi | cut -f 1 -d\ ) 2cf6a0fdf9d4fe6ee9b42ae2f093971b "setting -k - vg minimizer -t 1 -o x.mi -z x.zipcodes -d x.dist x.gbz is $? 0 "construction with zipcode payload" +# Make sure the default is to include path information; we know this graph has +# few enough paths +is "$(vg describe x.mi | grep 'payload =' | cut -f2 -d'=' | sed 's/^ *//')" "zipcodes with paths" "zipcodes include paths by default" + # Store zipcode payload and path information in the index # Construction will not be deterministic because the snarls are not deterministic vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x.gbz is $? 0 "construction with zipcode payload and mandatory path information" - rm -f x.vg x.dist x.mi x.zipcodes x.gbz diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 5ec9a33612..56c07d4021 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 82 +plan tests 83 vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg @@ -27,11 +27,6 @@ rm -f x-haps.gbwt x-paths.gbwt x-merged.gbwt vg giraffe -Z x.giraffe.gbz -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam is "${?}" "0" "a read can be mapped with gbz + min + zips + dist specified without crashing" -vg minimizer -d x.dist --rec-mode -o wrong.min -z wrong.zipcodes xx.gbz -vg giraffe -Z x.giraffe.gbz -m wrong.min -z wrong.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam -is "${?}" "1" "a minimizer index with the wrong payload type is rejected" -rm -f wrong.min wrong.zipcodes - rm -f x.shortread.withzip.min rm -f x.shortread.zipcodes vg giraffe -Z x.giraffe.gbz -d x.dist -f reads/small.middle.ref.fq > mapped1.gam @@ -69,6 +64,19 @@ rm -f log.txt vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.mismatched.fq -b chaining-sr >/dev/null is "${?}" "0" "a read with a mismatch can be mapped with the short read chaining preset" +rm -f x.longread.withzip.min +rm -f x.longread.zipcodes + +vg giraffe -Z x.giraffe.gbz -f reads/small.recombined.fq -b hifi >mapped.gam +is "$(vg view -aj mapped.gam | jq '.annotation.chain.rec_count')" "1" "recombinations are counted by default when mapping with the hifi preset" + +vg giraffe -Z x.giraffe.gbz -f reads/small.recombined.fq -b hifi --rec-penalty-aln 9999 >mapped.gam +is "$(vg view -aj mapped.gam | jq '.score // 0')" "0" "recombination alignment score penalty is used by default and can unmap a read" + +rm -f mapped.gam +rm -f x.longread.withzip.min +rm -f x.longread.zipcodes + rm -Rf grid-out mkdir grid-out vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.fq --output-basename grid-out/file --hard-hit-cap 5:6 From 93df42ea5672c82bbb3dcec49336bd45efafe0e0 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 12:17:19 -0700 Subject: [PATCH 20/23] Add missing include --- src/subcommand/giraffe_main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 71189581e2..39b3b42306 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include From 9d17b71f788beda4829b11c42e6fa2b0c0040d0f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 19 May 2026 18:22:46 -0400 Subject: [PATCH 21/23] Wrap help text --- src/subcommand/minimizer_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 074867685d..f2f69da695 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -169,7 +169,8 @@ void help_minimizer(char** argv) { std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl; std::cerr << " --no-dist build the index without distance index annotations" << std::endl; std::cerr << " (not recommended)" << std::endl; - std::cerr << " -E, --rec-mode assert MinimizerIndex will support recombination-aware mapping" << std::endl; + std::cerr << " -E, --rec-mode assert MinimizerIndex will support" << std::endl; + std::cerr << " recombination-aware mapping" << std::endl; std::cerr << " -h, --help print this help message to stderr and exit" << std::endl; std::cerr << std::endl; } From 554a6120c710cdb44842926734880408269e05a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 20 May 2026 13:18:02 -0400 Subject: [PATCH 22/23] Use GBWTGraph's idea of max samples, and sniff for path cover samples --- deps/gbwtgraph | 2 +- src/gbwtgraph_helper.cpp | 38 ++++++++++++++++++++++++++++++++------ src/gbwtgraph_helper.hpp | 4 ---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/deps/gbwtgraph b/deps/gbwtgraph index a248fe58bd..bcc248bff4 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit a248fe58bd59da9ee8328558199f01fbe2503a0e +Subproject commit bcc248bff469bcb6d69a9e37be1dab43580f2417 diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index d3dbbf7142..02d855c253 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -517,15 +517,34 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( // TODO: We're counting the magic generic samples here. size_t sample_count = gbz.graph.index->metadata.sample_names.size(); - // Put the paths in if we think they will fit. - bool paths_in_payload = sample_count <= MinimizerIndexParameters::MAX_PAYLOAD_PATHS; + // Consider reasons we can't index paths. + // Don't put the paths in if we think they won't fit. + bool too_many_samples = sample_count > gbwtgraph::MAX_PATH_IDS; + // Don't put the paths in if they would fit but were generated as a path + // cover. + bool has_path_cover = false; + if (!too_many_samples) { + // Check if any samples in the GBZ are path cover samples + for (size_t i = 0; i < sample_count; i++) { + // Look at the name of each sample + std::string sample_name = gbz.graph.index->metadata.sample(i); + // See if it starts with the path cover prefix. + // TODO: Use C++20 starts_with when available + // See + if (sample_name.rfind(gbwtgraph::COVER_PATH_SAMPLE_PREFIX, 0) == 0) { + has_path_cover = true; + break; + } + } + } // Determine payload size and type code. size_t payload_size = 0; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_NONE; if (distance_index != nullptr) { payload_size = MinimizerIndexParameters::ZIPCODE_PAYLOAD_SIZE; - if (paths_in_payload) { + if (!too_many_samples && !has_path_cover) { + // We can track the paths for recombination-aware mapping payload_size++; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { @@ -534,8 +553,15 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } if (require_path_payloads && payload_type != MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { // We want to bail out before doing all the indexing if we aren't going to have path info. - logger.error() << "Cannot build minimizer index supporting recombination-aware mapping (GBZ contains " - << sample_count << "/" << MinimizerIndexParameters::MAX_PAYLOAD_PATHS << " samples)" << std::endl; + auto s = logger.error() << "Cannot build minimizer index supporting recombination-aware mapping"; + if (too_many_samples) { + s << " because GBZ contains " << sample_count << " samples, more than the limit of " << gbwtgraph::MAX_PATH_IDS << std::endl; + } else if (has_path_cover) { + s << " because GBZ contains path cover samples starting with \"" + << gbwtgraph::COVER_PATH_SAMPLE_PREFIX << "\"" << std::endl; + } else { + s << std::endl; + } } std::string payload_str = MinimizerIndexParameters::payload_str(payload_type); @@ -578,7 +604,7 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } return reinterpret_cast(&payload_by_offset[nid - min_node_id]); }; - if (paths_in_payload) { + if (payload_type == MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { gbwtgraph::index_haplotypes_with_paths(gbz, index, get_payload); } else { gbwtgraph::index_haplotypes(gbz, index, get_payload); diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp index 30ff179029..df344f78e1 100644 --- a/src/gbwtgraph_helper.hpp +++ b/src/gbwtgraph_helper.hpp @@ -150,10 +150,6 @@ struct MinimizerIndexParameters { /// Number of words used for a zipcode payload. constexpr static size_t ZIPCODE_PAYLOAD_SIZE = sizeof(ZipCode::payload_type) / sizeof(gbwtgraph::KmerEncoding::code_type); - /// How many hapolotypoes should we expect to be able to store on a minimizer index path payload? - /// TODO: Move this to the minimizer index. - static constexpr int MAX_PAYLOAD_PATHS = 60; - enum PayloadType { /// No payload. PAYLOAD_NONE, From d09498930e22cf78ca917b739ce4f5c028c16133 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 20 May 2026 13:35:11 -0400 Subject: [PATCH 23/23] Add test for reporting on path covers and fix error message generation --- src/gbwtgraph_helper.cpp | 13 +++++++------ test/t/46_vg_minimizer.t | 10 ++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index 02d855c253..e9dbfda99e 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -553,15 +553,16 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } if (require_path_payloads && payload_type != MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { // We want to bail out before doing all the indexing if we aren't going to have path info. - auto s = logger.error() << "Cannot build minimizer index supporting recombination-aware mapping"; + std::stringstream ss; + ss << "Cannot build minimizer index supporting recombination-aware mapping"; if (too_many_samples) { - s << " because GBZ contains " << sample_count << " samples, more than the limit of " << gbwtgraph::MAX_PATH_IDS << std::endl; + ss << " because GBZ contains " << sample_count << " samples, more than the limit of " << gbwtgraph::MAX_PATH_IDS; } else if (has_path_cover) { - s << " because GBZ contains path cover samples starting with \"" - << gbwtgraph::COVER_PATH_SAMPLE_PREFIX << "\"" << std::endl; - } else { - s << std::endl; + ss << " because GBZ contains path cover samples starting with \"" + << gbwtgraph::COVER_PATH_SAMPLE_PREFIX << "\""; } + ss << std::endl; + logger.error() << ss.str(); } std::string payload_str = MinimizerIndexParameters::payload_str(payload_type); diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index 067e7a8d10..ef67f49599 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 12 +plan tests 14 # Indexing a single graph @@ -51,4 +51,10 @@ is "$(vg describe x.mi | grep 'payload =' | cut -f2 -d'=' | sed 's/^ *//')" "zip vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x.gbz is $? 0 "construction with zipcode payload and mandatory path information" -rm -f x.vg x.dist x.mi x.zipcodes x.gbz +# Build a path cover index +vg gbwt -P -x x.vg -g x_cover.gbz +vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x_cover.gbz 2>error.txt +is $? "1" "construction with mandatory path information fails on a path cover GBZ" +is $(grep 'path cover' error.txt | wc -l) "1" "error message references path cover as the problem" + +rm -f x.vg x.dist x.mi x.zipcodes x.gbz x_cover.gbz error.txt