diff --git a/deps/gbwtgraph b/deps/gbwtgraph index a248fe58bd..bcc248bff4 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit a248fe58bd59da9ee8328558199f01fbe2503a0e +Subproject commit bcc248bff469bcb6d69a9e37be1dab43580f2417 diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index 884ea95c73..30c8fec26f 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -640,13 +640,9 @@ TracedScore chain_items_dp(vector& chain_scores, const HandleGraph& graph, int gap_open, int gap_extension, + const ChainScoringScheme& scheme, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases, - int recomb_penalty, bool show_work) { DiagramExplainer diagram(show_work); @@ -659,7 +655,9 @@ TracedScore chain_items_dp(vector& chain_scores, cerr << "Chaining group of " << to_chain.size() << " items" << endl; } - crash_unless(recomb_penalty >= 0); + crash_unless(scheme.recombination_penalty >= 0); + crash_unless(scheme.consistency_bonus >= 0); + // Compute a base seed average length. // TODO: Weight anchors differently? @@ -683,11 +681,11 @@ TracedScore chain_items_dp(vector& chain_scores, // We store the bonus used to select the current winning predecessor for // each seed in this vector, which runs alongside the DP table. // - // Starting from nowhere means full path conservation, so bonus = recomb_penalty. - std::vector eval_bonuses(to_chain.size(), recomb_penalty); + // Starting from nowhere means full path conservation, so bonus = scheme.consistency_bonus. + std::vector eval_bonuses(to_chain.size(), scheme.consistency_bonus); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. - chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; + chain_scores[i] = {(int)(to_chain[i].score() * scheme.item_scale + scheme.item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; } // We will run this over every transition in a good DP order. @@ -706,7 +704,7 @@ TracedScore chain_items_dp(vector& chain_scores, auto& here = to_chain[transition.to_anchor]; // How many points is it worth to collect? - auto item_points = here.score() * item_scale + item_bonus; + auto item_points = here.score() * scheme.item_scale + scheme.item_bonus; std::string here_gvnode; if (diagram) { @@ -714,10 +712,10 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - // This also has full path conservation (bonus = recomb_penalty). + // This also has full path conservation (bonus = scheme.consistency_bonus). { TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; - int nowhere_bonus = recomb_penalty; + int nowhere_bonus = scheme.consistency_bonus; int eval_nowhere = from_nowhere.score + nowhere_bonus; int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; if (eval_nowhere > eval_current) { @@ -787,13 +785,13 @@ TracedScore chain_items_dp(vector& chain_scores, // // But we account for anchor length in the item points, so don't use it // here. - jump_points = -score_chain_gap(indel_length, base_seed_length) * gap_scale; + jump_points = -score_chain_gap(indel_length, base_seed_length) * scheme.gap_scale; // add recombination penalty if necessary - jump_points -= check_recombination(chain_scores[transition.from_anchor], here) * recomb_penalty; + jump_points -= check_recombination(chain_scores[transition.from_anchor], here) * scheme.recombination_penalty; // We can also account for the non-indel material, which we assume will have some identity in it. - jump_points += possible_match_length * points_per_possible_match; + jump_points += possible_match_length * scheme.points_per_possible_match; } if (jump_points != numeric_limits::min()) { @@ -805,15 +803,15 @@ TracedScore chain_items_dp(vector& chain_scores, .set_shared_paths(here.anchor_paths()); // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. - // Bonus = fraction of conserved paths * recomb_penalty. + // Bonus = fraction of conserved paths * scheme.consistency_bonus. // Bonus is 0 when recombination occurs (no shared paths). int eval_bonus_from = 0; - if (recomb_penalty > 0) { + if (scheme.consistency_bonus > 0) { int pre_count = __builtin_popcountll(source_score.paths); if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { // No recombination: bonus = fraction of paths conserved * penalty int post_count = __builtin_popcountll(from_source_score.paths); - eval_bonus_from = (recomb_penalty * post_count) / pre_count; + eval_bonus_from = (scheme.consistency_bonus * post_count) / pre_count; } // Recombination case (no shared paths): bonus stays 0 } @@ -895,7 +893,7 @@ TracedScore chain_items_dp(vector& chain_scores, if (diagram) { // Draw the item in the diagram - auto item_points = here.score() * item_scale + item_bonus; + auto item_points = here.score() * scheme.item_scale + scheme.item_bonus; std::string here_gvnode = "i" + std::to_string(to_anchor); std::stringstream label_stream; label_stream << "#" << to_anchor << " " << here << " = " << item_points @@ -941,8 +939,7 @@ TracedScore chain_items_dp(vector& chain_scores, vector, int>> chain_items_traceback(const vector& chain_scores, const VectorView& to_chain, const TracedScore& best_past_ending_score_ever, - int item_bonus, - double item_scale, + const ChainScoringScheme& scheme, size_t max_tracebacks) { // We will fill this in with all the tracebacks, and then sort and truncate. @@ -990,7 +987,7 @@ vector, int>> chain_items_traceback(const vector& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty, + const ChainScoringScheme& scheme, size_t max_chains, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases, bool show_work) { @@ -1057,20 +1050,16 @@ ChainsResult find_best_chains(const VectorView& to_chain, graph, gap_open, gap_extension, + scheme, for_each_transition, - item_bonus, - item_scale, - gap_scale, - points_per_possible_match, max_indel_bases, - recomb_penalty, - show_work); + show_work); #ifdef debug_chaining std::cerr << "[REC INFO] Recombination number for chain: " << best_past_ending_score_ever.rec_num << "\tscore: " << best_past_ending_score_ever.score << "\tpaths: " << best_past_ending_score_ever.paths << std::endl; #endif // Then do the tracebacks vector, int>> tracebacks = chain_items_traceback( - chain_scores, to_chain, best_past_ending_score_ever, item_bonus, item_scale, max_chains); + chain_scores, to_chain, best_past_ending_score_ever, scheme, max_chains); if (tracebacks.empty()) { // Somehow we got nothing @@ -1181,12 +1170,8 @@ pair> find_best_chain(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty, + const ChainScoringScheme& scheme, const transition_iterator& for_each_transition, - int item_bonus, - double item_scale, - double gap_scale, - double points_per_possible_match, size_t max_indel_bases) { ChainsResult cr = find_best_chains( @@ -1195,13 +1180,9 @@ pair> find_best_chain(const VectorView& to_chain, graph, gap_open, gap_extension, - recomb_penalty, + scheme, 1, for_each_transition, - item_bonus, - item_scale, - gap_scale, - points_per_possible_match, max_indel_bases ); return cr.chains.front().scored_chain; diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index b28d04a0fc..5fca96f941 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -382,6 +382,7 @@ ostream& operator<<(ostream& out, const TracedScore& value); */ void sort_anchor_indexes(const std::vector& items, std::vector& indexes); +/// Represents a possible transition between anchors struct transition_info { // Index of the source anchor within the list of anchors size_t from_anchor; @@ -392,14 +393,37 @@ struct transition_info { // Distance between anchors in the read size_t read_distance; - // Constructor; read_distance defaults to max if not given + /// Build transition info from loose values inline transition_info(size_t from, size_t to, size_t graph_dist, size_t read_dist = std::numeric_limits::max()) : from_anchor(from), to_anchor(to), graph_distance(graph_dist), read_distance(read_dist) {} }; + +/// Represents the scoring scheme for chains, to determine which are best. +/// Doesn't cover the parameters that really belong to an alignment scoring +/// scheme (like gap open and extend). +struct ChainScoringScheme { + /// Score bonus for each item collected + int item_bonus = 0; + /// Scale to apply to each item's own score + double item_scale = 1.0; + /// Scale to apply to the scores of gaps + double gap_scale = 1.0; + /// Add this many points per potential match between two seeds + double points_per_possible_match = 0; + /// Penalize this many points per recombination + int recombination_penalty = 0; + /// Apply a bonus during alternative selection (but not to actual DP + /// scores) of this many points when matching haplotype paths are + /// preserved, scaled by fraction of haplotypes preserved. + int consistency_bonus = 0; +}; + /// A single chain result: scored chain plus the recombination count observed /// on its endpoint. +/// TODO: Is there a better name for the abstraction this is getting at? struct ChainWithRec { + // TODO: Shouldn't we split this into 2 fields? std::pair> scored_chain; // Positions (anchor indices) in the chain that introduce a recombination // event between anchors. These correspond to anchors where we had to @@ -418,6 +442,8 @@ struct ChainWithRec { /// Result of finding best chains: a list of chains each paired with the /// recombination count observed at that chain's endpoint. +/// TODO: Can we get rid of this once we're sure it won't need more fields? +/// TODO: Is there a better name for this? struct ChainsResult { std::vector chains; }; @@ -525,8 +551,7 @@ std::vector calculate_transition_read_distances( * * Input items must be sorted by start position in the read. * - * Takes the given per-item bonus for each item collected, and scales item - * scores by the given scale. + * Uses the given scoring scheme to score chains. * * Uses a transition iterator to enumerate where we can come from to reach an * item. @@ -538,15 +563,12 @@ TracedScore chain_items_dp(vector& chain_scores, const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, + // TODO: We should maybe just take an EditAlignmentScorer here. int gap_open, int gap_extension, + const ChainScoringScheme& scheme = ChainScoringScheme(), const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100, - int recomb_penalty = 0, bool show_work = false ); @@ -567,8 +589,7 @@ TracedScore chain_items_dp(vector& chain_scores, vector, int>> chain_items_traceback(const vector& chain_scores, const VectorView& to_chain, const TracedScore& best_past_ending_score_ever, - int item_bonus = 0, - double item_scale = 1.0, + const ChainScoringScheme& scheme = ChainScoringScheme(), size_t max_tracebacks = 1); @@ -586,13 +607,9 @@ ChainsResult find_best_chains(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty = 0, + const ChainScoringScheme& scheme = ChainScoringScheme(), size_t max_chains = 1, const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100, bool show_work = false); @@ -610,12 +627,8 @@ pair> find_best_chain(const VectorView& to_chain, const HandleGraph& graph, int gap_open, int gap_extension, - int recomb_penalty = 0, + const ChainScoringScheme& scheme = ChainScoringScheme(), const transition_iterator& for_each_transition = lookback_transition_iterator(150, 0, 100), - int item_bonus = 0, - double item_scale = 1.0, - double gap_scale = 1.0, - double points_per_possible_match = 0, size_t max_indel_bases = 100); /** diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index 51bb1419a3..e9dbfda99e 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -499,30 +499,71 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( const gbwtgraph::GBZ& gbz, const SnarlDistanceIndex* distance_index, ZipCodeCollection* oversized_zipcodes, - const MinimizerIndexParameters& params + const MinimizerIndexParameters& params, + bool require_path_payloads ) { double start = gbwt::readTimer(); + Logger logger("build_minimizer_index"); + // Minimal validation to avoid inconsistent parameters that would // otherwise require manual handling. std::string err = params.validate(); if (!err.empty()) { - std::cerr << "error: [build_minimizer_index()] " << err << std::endl; - std::exit(EXIT_FAILURE); + logger.error() << err << std::endl; + } + + // Count the distinct samples that we might use for the payload. + // TODO: We're counting the magic generic samples here. + size_t sample_count = gbz.graph.index->metadata.sample_names.size(); + + // Consider reasons we can't index paths. + // Don't put the paths in if we think they won't fit. + bool too_many_samples = sample_count > gbwtgraph::MAX_PATH_IDS; + // Don't put the paths in if they would fit but were generated as a path + // cover. + bool has_path_cover = false; + if (!too_many_samples) { + // Check if any samples in the GBZ are path cover samples + for (size_t i = 0; i < sample_count; i++) { + // Look at the name of each sample + std::string sample_name = gbz.graph.index->metadata.sample(i); + // See if it starts with the path cover prefix. + // TODO: Use C++20 starts_with when available + // See + if (sample_name.rfind(gbwtgraph::COVER_PATH_SAMPLE_PREFIX, 0) == 0) { + has_path_cover = true; + break; + } + } } - // Determine payload size and type. + // Determine payload size and type code. size_t payload_size = 0; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_NONE; if (distance_index != nullptr) { payload_size = MinimizerIndexParameters::ZIPCODE_PAYLOAD_SIZE; - if (params.paths_in_payload) { + if (!too_many_samples && !has_path_cover) { + // We can track the paths for recombination-aware mapping payload_size++; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; } } + if (require_path_payloads && payload_type != MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { + // We want to bail out before doing all the indexing if we aren't going to have path info. + std::stringstream ss; + ss << "Cannot build minimizer index supporting recombination-aware mapping"; + if (too_many_samples) { + ss << " because GBZ contains " << sample_count << " samples, more than the limit of " << gbwtgraph::MAX_PATH_IDS; + } else if (has_path_cover) { + ss << " because GBZ contains path cover samples starting with \"" + << gbwtgraph::COVER_PATH_SAMPLE_PREFIX << "\""; + } + ss << std::endl; + logger.error() << ss.str(); + } std::string payload_str = MinimizerIndexParameters::payload_str(payload_type); // Create an empty minimizer index. @@ -535,13 +576,13 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( // Build the index. if (params.progress) { - std::cerr << "Building MinimizerIndex with k = " << index.k(); + auto s = logger.info() << "Building MinimizerIndex with k = " << index.k(); if (index.uses_syncmers()) { - std::cerr << ", s = " << index.s(); + s << ", s = " << index.s(); } else { - std::cerr << ", w = " << index.w(); + s << ", w = " << index.w(); } - std::cerr << ", payload = " << payload_str << std::endl; + s << ", payload = " << payload_str << std::endl; } if (distance_index == nullptr) { @@ -564,7 +605,7 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } return reinterpret_cast(&payload_by_offset[nid - min_node_id]); }; - if (params.paths_in_payload) { + if (payload_type == MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) { gbwtgraph::index_haplotypes_with_paths(gbz, index, get_payload); } else { gbwtgraph::index_haplotypes(gbz, index, get_payload); @@ -573,11 +614,11 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( // Index statistics. if (params.progress) { - std::cerr << index.size() << " keys (" << index.unique_keys() << " unique)" << std::endl; - std::cerr << "Minimizer occurrences: " << index.number_of_values() << std::endl; - std::cerr << "Load factor: " << index.load_factor() << std::endl; + logger.info() << index.size() << " keys (" << index.unique_keys() << " unique)" << std::endl; + logger.info() << "Minimizer occurrences: " << index.number_of_values() << std::endl; + logger.info() << "Load factor: " << index.load_factor() << std::endl; double seconds = gbwt::readTimer() - start; - std::cerr << "Construction time: " << seconds << " seconds" << std::endl; + logger.info() << "Construction time: " << seconds << " seconds" << std::endl; } return index; diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp index 161bae25c2..df344f78e1 100644 --- a/src/gbwtgraph_helper.hpp +++ b/src/gbwtgraph_helper.hpp @@ -148,7 +148,7 @@ struct MinimizerIndexParameters { constexpr static size_t HASH_TABLE_MAX_WIDTH = 36; /// Number of words used for a zipcode payload. - constexpr static size_t ZIPCODE_PAYLOAD_SIZE = sizeof(ZipCode::payload_type) / sizeof(gbwtgraph::KmerEncoding::code_type); + constexpr static size_t ZIPCODE_PAYLOAD_SIZE = sizeof(ZipCode::payload_type) / sizeof(gbwtgraph::KmerEncoding::code_type); enum PayloadType { /// No payload. @@ -171,10 +171,6 @@ struct MinimizerIndexParameters { /// Whether to use syncmers instead of minimizers. bool use_syncmers = false; - /// Whether to include path information in the payload (for recombination-aware mapping). - /// Ignored if there is no zipcode payload. - bool paths_in_payload = false; - /// Whether to use weighted minimizers (cannot be used with syncmers). bool use_weighted_minimizers = false; @@ -209,12 +205,6 @@ struct MinimizerIndexParameters { return *this; } - /// Includes path information in the payload. - MinimizerIndexParameters& with_paths(bool paths_in_payload = true) { - this->paths_in_payload = paths_in_payload; - return *this; - } - /// Sets weighted minimizer parameters. MinimizerIndexParameters& weighted( bool use_weighted_minimizers, @@ -256,7 +246,8 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( const gbwtgraph::GBZ& gbz, const SnarlDistanceIndex* distance_index, ZipCodeCollection* oversized_zipcodes, - const MinimizerIndexParameters& params + const MinimizerIndexParameters& params, + bool require_path_payloads = false ); /// Checks that the minimizer index has the expected payload type. diff --git a/src/index_registry.cpp b/src/index_registry.cpp index cb09dbcbfa..04fc9b454a 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -519,8 +519,7 @@ construct_minimizers_impl( if (IndexingParameters::verbosity != IndexingParameters::None) { info(context) << "Constructing minimizer index and associated zipcodes." << endl; info(context) << " using parameters -k " << params.k << " -w " << params.w_or_s - << (params.use_weighted_minimizers ? " -W " : "") - << (params.paths_in_payload ? " with paths" : "") << endl; + << (params.use_weighted_minimizers ? " -W " : "") << endl; } assert(inputs.size() == 2); @@ -4573,25 +4572,11 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // TODO: We may not always want to store the minimizer index. Rebuilding the index may be // faster than loading it from a network drive. - registry.register_recipe( - {"Long Read PathMinimizers", "Long Read PathZipcodes"}, {"Giraffe Distance Index", "Giraffe GBZ"}, - [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { - MinimizerIndexParameters params; - params.minimizers(IndexingParameters::long_read_minimizer_k, IndexingParameters::long_read_minimizer_w) - .with_paths(true) - .weighted(IndexingParameters::long_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) - .kmer_counting(IndexingParameters::space_efficient_counting) - .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); - return construct_minimizers_impl(inputs, plan, constructing, params); - } - ); - registry.register_recipe( {"Short Read Minimizers", "Short Read Zipcodes"}, {"Giraffe Distance Index", "Giraffe GBZ"}, [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { MinimizerIndexParameters params; params.minimizers(IndexingParameters::short_read_minimizer_k, IndexingParameters::short_read_minimizer_w) - .with_paths(false) .weighted(IndexingParameters::short_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) .kmer_counting(IndexingParameters::space_efficient_counting) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); @@ -4603,7 +4588,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { [&](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { MinimizerIndexParameters params; params.minimizers(IndexingParameters::long_read_minimizer_k, IndexingParameters::long_read_minimizer_w) - .with_paths(false) .weighted(IndexingParameters::long_read_minimizer_W, IndexingParameters::minimizer_downweight_threshold) .kmer_counting(IndexingParameters::space_efficient_counting) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); @@ -4662,16 +4646,6 @@ vector VGIndexes::get_default_long_giraffe_indexes() { return indexes; } -vector VGIndexes::get_default_long_path_giraffe_indexes() { - vector indexes{ - "Giraffe Distance Index", - "Giraffe GBZ", - "Long Read PathMinimizers", - "Long Read PathZipcodes", - }; - return indexes; -} - vector VGIndexes::get_default_haplotype_sampling_indexes() { vector indexes{ "GBZ", diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 7f20a5fd37..45180237c9 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -161,8 +161,6 @@ struct VGIndexes { static vector get_default_short_giraffe_indexes(); /// A list of the identifiers of the default indexes to run vg giraffe on long reads static vector get_default_long_giraffe_indexes(); - /// A list of the identifiers of the default indexes to run vg giraffe on long reads with path minimizers - static vector get_default_long_path_giraffe_indexes(); /// A list of the identifiers of the default indexes to run vg haplotypes static vector get_default_haplotype_sampling_indexes(); }; diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp index eea5ac6918..aa7a4d1618 100644 --- a/src/minimizer_mapper.cpp +++ b/src/minimizer_mapper.cpp @@ -61,8 +61,8 @@ MinimizerMapper::MinimizerMapper(const gbwtgraph::GBWTGraph& graph, path_graph(path_graph), minimizer_index(minimizer_index), k(minimizer_index.k()), w(minimizer_index.w()), - payload_with_paths(has_payload(minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS)), uses_syncmers(minimizer_index.uses_syncmers()), + use_payload_paths(has_payload(minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS)), distance_index(distance_index), zipcodes(zipcodes), clusterer(distance_index, &graph), @@ -4461,7 +4461,7 @@ std::vector MinimizerMapper::find_seeds(const std::vector // Handle the payload. seeds.back().zipcode.fill_in_zipcode(occ.second, this->zipcodes, *(this->distance_index), hit); - if (this->payload_with_paths) { + if (this->use_payload_paths) { seeds.back().paths = occ.second[MinimizerIndexParameters::ZIPCODE_PAYLOAD_SIZE]; } } diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 48924f0a2a..4a618a8ad2 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -291,9 +291,12 @@ class MinimizerMapper : public AlignerClient { /// at chaining? static constexpr double default_gap_scale = 1.0; double gap_scale = default_gap_scale; - /// Recombination penalty for chaining. This is added to the score of a transition if there are no shared haplotypes. - static constexpr int default_rec_penalty_chain = 0; - int rec_penalty_chain = default_rec_penalty_chain; + /// Recombination penalty for chaining. This is added to the cost of a transition if there are no shared haplotypes. + static constexpr int default_rec_penalty = 0; + int rec_penalty = default_rec_penalty; + /// Recombination-aware chaining bonus for avoiding losing haplotypes. Not actually tracked in chaining DP score. + static constexpr int default_rec_consistency_bonus = 0; + int rec_consistency_bonus = default_rec_consistency_bonus; // How many points should we treat a non-gap connection base as producing, at chaining? static constexpr double default_points_per_possible_match = 0; double points_per_possible_match = default_points_per_possible_match; @@ -378,6 +381,9 @@ class MinimizerMapper : public AlignerClient { static constexpr int default_wfa_max_distance = WFAExtender::ErrorModel::default_distance().max; int wfa_max_distance = default_wfa_max_distance; + /// Recombination penalty for alignment scoring. This is deducted from the score of an alignment for each required recombination. + static constexpr int default_rec_penalty_aln = 0; + int rec_penalty_aln = default_rec_penalty_aln; /// How much should candidate alignment scores be penalized for softclipped bases? static constexpr double default_softclip_penalty = 0.0; double softclip_penalty = default_softclip_penalty; @@ -508,6 +514,11 @@ class MinimizerMapper : public AlignerClient { string sample_name; /// Apply this read group name string read_group; + + /// Should we use path information from the minimizer index payloads? + /// By default we fill this in based on availability in the index, but you + /// can clear this if it is set to turn off recombination-aware mapping. + bool use_payload_paths; /// Have we complained about hitting the size limit for rescue? atomic_flag warned_about_rescue_size = ATOMIC_FLAG_INIT; @@ -647,9 +658,8 @@ class MinimizerMapper : public AlignerClient { // caching common information about the minimizer index int32_t k; int32_t w; - bool payload_with_paths; // Does the payload for minimizer hits include path information in addition to a zipcode? bool uses_syncmers; // TODO: We could discard the syncmer support. - + SnarlDistanceIndex* distance_index; const ZipCodeCollection* zipcodes; /// This is our primary graph. @@ -906,14 +916,21 @@ class MinimizerMapper : public AlignerClient { const std::vector>& minimizer_kept_chain_count, vector& alignments, vector& multiplicity_by_alignment, vector& alignments_to_source, - SmallBitset& minimizer_explored, aligner_stats_t& stats, bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const; + SmallBitset& minimizer_explored, aligner_stats_t& stats, LazyRNG& rng, Funnel& funnel) const; + /** + * Select the max_multimaps best alignments from alignments into mappings. + * + * Skips alignments that overlap too much with previous alignments. + * + * If no alignments have a positive score, responsible for creating an unmapped mapping. + */ void pick_mappings_from_alignments(Alignment& aln, const std::vector& alignments, const std::vector& multiplicity_by_alignment, const std::vector& alignments_to_source, const std::vector& chain_score_estimates, std::vector& mappings, std::vector& scores, std::vector& multiplicity_by_mapping, - bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const; + LazyRNG& rng, Funnel& funnel) const; diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 3c8cb7b50e..e1bfd77441 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -835,8 +835,6 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // Track statistics about how many bases were aligned by diffrent methods, and how much time was used. aligner_stats_t stats; - bool funnel_depleted = false; - // This maps from alignment index back to chain index, for // tracing back to minimizers for MAPQ. Can hold // numeric_limits::max() for an unaligned alignment. @@ -846,7 +844,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (alignments.size() == 0) { do_alignment_on_chains(aln, seeds, minimizers, seed_anchors, chains, chain_source_tree, multiplicity_by_chain, chain_score_estimates, minimizer_kept_chain_count, alignments, multiplicity_by_alignment, - alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); + alignments_to_source, minimizer_explored, stats, rng, funnel); } for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { @@ -878,11 +876,11 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { size_t chain_index = alignments_to_source[alignment_index]; if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); - if (rec_penalty_chain != 0) { + if (rec_penalty != 0) { // Penalize the score of alignment candidates according to the number of recombinations their chains required. // This allows alignments that required fewer recombinations in their chains to win. // TODO: We'd also eventaully like to count recombinations that we don't know are needed until base-level DP. - int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); + int64_t penalty = static_cast(rec_penalty_aln == -1 ? rec_penalty : rec_penalty_aln) * static_cast(chain_rec_counts[chain_index]); int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; alignments[alignment_index].set_score(static_cast(penalized_score)); } @@ -903,9 +901,10 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { vector scores; //The multiplicities of mappings vector multiplicity_by_mapping; - + + // Collect the chosen mappings, or create the unmapped-read mapping. pick_mappings_from_alignments(aln, alignments, multiplicity_by_alignment, alignments_to_source, chain_score_estimates, - mappings, scores, multiplicity_by_mapping, funnel_depleted, rng, funnel); + mappings, scores, multiplicity_by_mapping, rng, funnel); if (track_provenance) { funnel.substage("mapq"); @@ -1575,19 +1574,26 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& graph_lookback_limit, read_lookback_limit ); + // TODO: Should we just inherit from ChainScoringScheme? Or should + // we set one up as a member? + algorithms::ChainScoringScheme scheme { + this->item_bonus, + this->item_scale, + this->gap_scale, + this->points_per_possible_match, + this->rec_penalty, + // TODO: Do this once at setup? + this->rec_consistency_bonus == -1 ? this->rec_penalty : this->rec_consistency_bonus, + }; chain_results = algorithms::find_best_chains( anchor_view, *distance_index, gbwt_graph, get_regular_aligner()->scorer->gap_open, get_regular_aligner()->scorer->gap_extension, - this->rec_penalty_chain, + scheme, this->max_alignments, for_each_transition, - this->item_bonus, - this->item_scale, - this->gap_scale, - this->points_per_possible_match, indel_limit, show_work ); @@ -1877,7 +1883,6 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector& alignments, vector& multiplicity_by_alignment, vector& alignments_to_source, SmallBitset& minimizer_explored, aligner_stats_t& stats, - bool& funnel_depleted, LazyRNG& rng, Funnel& funnel) const { if (track_provenance) { @@ -2197,41 +2202,27 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector::max()); - multiplicity_by_alignment.emplace_back(0); - // Stop telling the funnel about filters and items. - funnel_depleted = true; - } else { - //chain_count_by_alignment is currently the number of better or equal chains that were used - // We really want the number of chains not including the ones that represent the same mapping - // TODO: This isn't very efficient - for (size_t i = 0 ; i < chain_count_by_alignment.size() ; ++i) { - size_t chain_i = alignments_to_source[i]; - for (size_t j = 0 ; j < chain_count_by_alignment.size() ; ++j) { - size_t chain_j = alignments_to_source[j]; - if (i != j && - chain_score_estimates[chain_i] >= chain_score_estimates[chain_j] && - chain_ranges_are_equivalent(seeds[chains[chain_i].front()], - seeds[chains[chain_i].back()], - seeds[chains[chain_j].front()], - seeds[chains[chain_j].back()])) { - --chain_count_by_alignment[i]; - } + //chain_count_by_alignment is currently the number of better or equal chains that were used + // We really want the number of chains not including the ones that represent the same mapping + // TODO: This isn't very efficient + for (size_t i = 0 ; i < chain_count_by_alignment.size() ; ++i) { + size_t chain_i = alignments_to_source[i]; + for (size_t j = 0 ; j < chain_count_by_alignment.size() ; ++j) { + size_t chain_j = alignments_to_source[j]; + if (i != j && + chain_score_estimates[chain_i] >= chain_score_estimates[chain_j] && + chain_ranges_are_equivalent(seeds[chains[chain_i].front()], + seeds[chains[chain_i].back()], + seeds[chains[chain_j].front()], + seeds[chains[chain_j].back()])) { + --chain_count_by_alignment[i]; } } - for (size_t i = 0 ; i < multiplicity_by_alignment.size() ; ++i) { - multiplicity_by_alignment[i] += (chain_count_by_alignment[i] >= alignments.size() - ? ((double)chain_count_by_alignment[i] - (double) alignments.size()) - : 0.0); - } + } + for (size_t i = 0 ; i < multiplicity_by_alignment.size() ; ++i) { + multiplicity_by_alignment[i] += (chain_count_by_alignment[i] >= alignments.size() + ? ((double)chain_count_by_alignment[i] - (double) alignments.size()) + : 0.0); } } @@ -2242,7 +2233,7 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v std::vector& mappings, std::vector& scores, std::vector& multiplicity_by_mapping, - bool& funnel_depleted, LazyRNG& rng, + LazyRNG& rng, Funnel& funnel) const { // Look for duplicate alignments by using this collection of node IDs and orientations @@ -2287,14 +2278,13 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v if (this->sort_by_chain_score) { // Use the chain's score to rank the alignments size_t chain_number = alignments_to_source.at(alignment_number); - if (chain_number == std::numeric_limits::max()) { - // This is an unaligned alignment, score 0. - return 0; - } + // Unaligned alignments from no chain are no longer allowed in; + // this function is supposed to make them. So all alignments + // reference into chain_score_estimates. return chain_score_estimates.at(chain_number); } else { // Use base-level alignment score to rank alignments - // Tiebreak by identity (always > 0 and <= 1) + // Tiebreak by identity (which is always 0 to 1) return alignments.at(alignment_number).score() + identity(alignments.at(alignment_number).path()); } }; @@ -2309,39 +2299,51 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // This alignment makes it // Called in score order + // Filter to alignments with strictly positive scores + if (alignments[alignment_num].score() <= 0) { + if (track_provenance) { + funnel.fail("nonzero-score", alignment_num); + } + return false; + } else { + if (track_provenance) { + funnel.pass("nonzero-score", alignment_num); + } + } + // Do the unique node fraction filter double unique_node_fraction = get_fraction_unique(alignment_num); if (unique_node_fraction < min_unique_node_fraction) { // If not enough of the alignment is from unique nodes, drop it. - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " rejected because only " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } } return false; } else { - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.pass("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " accepted because " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } } } - if (track_provenance && !funnel_depleted) { + if (track_provenance) { // Tell the funnel funnel.pass("max-multimaps", alignment_num); } @@ -2357,7 +2359,7 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // Remember the multiplicity multiplicity_by_mapping.emplace_back(multiplicity_by_alignment[alignment_num]); - if (track_provenance && !funnel_depleted) { + if (track_provenance) { // Tell the funnel funnel.project(alignment_num); funnel.score(funnel.latest(), scores.back()); @@ -2372,14 +2374,14 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v double unique_node_fraction = get_fraction_unique(alignment_num); if (unique_node_fraction < min_unique_node_fraction) { // If not enough of the alignment is from unique nodes, drop it. - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " rejected because only " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } @@ -2387,14 +2389,14 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v // If we fail the unique node fraction filter, we won't count as a secondary for MAPQ return; } else { - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.pass("min-unique-node-fraction", alignment_num, unique_node_fraction); } if (show_work) { #pragma omp critical (cerr) { cerr << log_name() << "alignment " << alignment_num << " accepted because " << unique_node_fraction << " of it is from nodes not already used" << endl; - if (track_correctness && !funnel_depleted && funnel.was_correct(alignment_num)) { + if (track_correctness && funnel.was_correct(alignment_num)) { cerr << log_name() << "\tCORRECT!" << endl; } } @@ -2405,14 +2407,36 @@ void MinimizerMapper::pick_mappings_from_alignments(Alignment& aln, const std::v scores.emplace_back(alignments[alignment_num].score()); multiplicity_by_mapping.emplace_back(multiplicity_by_alignment[alignment_num]); - if (track_provenance && !funnel_depleted) { + if (track_provenance) { funnel.fail("max-multimaps", alignment_num); } }, [&](size_t alignment_num) { - // This alignment does not have a sufficiently good score - // Score threshold is 0; this should never happen - crash_unless(false); + // This alignment does not have a sufficiently good score. + // It may have been penalized into negative score. + + if (track_provenance) { + // Call this a fail of the nonzero score filter, even though that + // really filters on alignment score and not sorting score, which + // can be different. + // + // TODO: Remove feature to sort by chain score? + funnel.fail("nonzero-score", alignment_num); + } }); + + if (mappings.empty()) { + // We didn't find any mappings, so make an unmapped one. + + scores.emplace_back(0); + mappings.emplace_back(aln); + multiplicity_by_mapping.emplace_back(0); + + if (track_provenance) { + // Tell the funnel + funnel.introduce(); + funnel.score(funnel.latest(), scores.back()); + } + } } double MinimizerMapper::get_read_coverage( diff --git a/src/subcommand/chain_main.cpp b/src/subcommand/chain_main.cpp index b595077e80..c8108e0aab 100644 --- a/src/subcommand/chain_main.cpp +++ b/src/subcommand/chain_main.cpp @@ -38,7 +38,7 @@ int main_chain(int argc, char** argv) { Logger logger("vg chain"); bool show_progress = false; - int recomb_penalty = 0; + int recombination_penalty = 0; int c; optind = 2; // force optind past command positional argument @@ -67,7 +67,7 @@ int main_chain(int argc, char** argv) { break; case 'r': - recomb_penalty = atof(optarg); + recombination_penalty = atof(optarg); break; case 'h': @@ -284,7 +284,10 @@ int main_chain(int argc, char** argv) { #endif // Do the chaining. We assume items is already sorted right. - std::pair> score_and_chain = vg::algorithms::find_best_chain(items, distance_index, graph, scorer.scorer->gap_open, scorer.scorer->gap_extension, recomb_penalty); + // TODO: Replace with designated initializer list when we get C++20 + vg::algorithms::ChainScoringScheme scheme; + scheme.recombination_penalty = recombination_penalty; + std::pair> score_and_chain = vg::algorithms::find_best_chain(items, distance_index, graph, scorer.scorer->gap_open, scorer.scorer->gap_extension, scheme); logger.info() << "Best chain gets score " << score_and_chain.first << std::endl; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 1337a2c629..39b3b42306 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -450,10 +451,17 @@ static std::unique_ptr get_options() { double_is_nonnegative ); chaining_opts.add_range( - "rec-penalty-chain", - &MinimizerMapper::rec_penalty_chain, - MinimizerMapper::default_rec_penalty_chain, - "penalty for a recombination when chaining, requires -E (ALPHA)", + "rec-penalty", + &MinimizerMapper::rec_penalty, + MinimizerMapper::default_rec_penalty, + "penalty for a recombination in recombination-aware mode", + int_is_nonnegative + ); + chaining_opts.add_range( + "rec-consistency-bonus", + &MinimizerMapper::rec_consistency_bonus, + MinimizerMapper::default_rec_consistency_bonus, + "untracked bonus for chains staying consistent with haplotypes to avoid recombinations in recombination-aware mode", int_is_nonnegative ); chaining_opts.add_range( @@ -590,6 +598,13 @@ static std::unique_ptr get_options() { MinimizerMapper::default_wfa_max_distance, "band distance to allow in the longest WFA connection or tail" ); + chaining_opts.add_range( + "rec-penalty-aln", + &MinimizerMapper::rec_penalty_aln, + MinimizerMapper::default_rec_penalty_aln, + "penalty per recombination for final alignment scoring in recombination-aware mode", + int_is_nonnegative + ); chaining_opts.add_range( "softclip-penalty", &MinimizerMapper::softclip_penalty, @@ -715,7 +730,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map rec_mode; // Map algorithm names to rescue algorithms std::map rescue_algorithms = { @@ -944,6 +965,8 @@ int main_giraffe(int argc, char** argv) { .add_entry("item-bonus", 2) .add_entry("item-scale", 1.0) .add_entry("gap-scale", 0.27579) + .add_entry("rec-penalty", 2) + .add_entry("rec-consistency-bonus", 12) .add_entry("chain-score-threshold", 234.0) .add_entry("min-chains", 2) .add_entry("min-chain-score-per-base", 0.24) @@ -963,6 +986,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("wfa-max-mismatches", 2) .add_entry("wfa-max-mismatches-per-base", 0.05) .add_entry("wfa-max-max-mismatches", 15) + .add_entry("rec-penalty-aln", 28) .add_entry("prune-low-cplx", true); Preset r10_base; @@ -1000,9 +1024,14 @@ int main_giraffe(int argc, char** argv) { .add_entry("item-bonus", 20) .add_entry("item-scale", 1.0) .add_entry("gap-scale", 0.06759721757973396) + .add_entry("rec-penalty", 2) + .add_entry("rec-consistency-bonus", 13) .add_entry("chain-score-threshold", 160.0) .add_entry("min-chains", 2) .add_entry("max-chains-per-tree", 3) + // Lowering this can reduce wrong reads in mapping experiments, but + // seems to *increase* miscalls; see + // https://ucsc-gi.slack.com/archives/CJ2EHEH1A/p1779202719223779?thread_ts=1779127572.303779&cid=CJ2EHEH1A .add_entry("min-chain-score-per-base", 0.052) .add_entry("max-min-chain-score", 1900.0) .add_entry("min-indel-avoid-bases", 50) @@ -1019,6 +1048,7 @@ int main_giraffe(int argc, char** argv) { .add_entry("wfa-max-mismatches", 2) .add_entry("wfa-max-mismatches-per-base", 0.05) .add_entry("wfa-max-max-mismatches", 15) + .add_entry("rec-penalty-aln", 32) .add_entry("prune-low-cplx", true); presets.emplace("r10", r10_base); @@ -1125,6 +1155,7 @@ int main_giraffe(int argc, char** argv) { {"report-name", required_argument, 0, OPT_REPORT_NAME}, {"parameter-preset", required_argument, 0, 'b'}, {"rec-mode", no_argument, 0, 'E'}, + {"no-rec-mode", no_argument, 0, OPT_NO_REC_MODE}, {"rescue-algorithm", required_argument, 0, 'A'}, {"fragment-mean", required_argument, 0, OPT_FRAGMENT_MEAN }, {"fragment-stdev", required_argument, 0, OPT_FRAGMENT_STDEV }, @@ -1202,13 +1233,11 @@ int main_giraffe(int argc, char** argv) { case 'm': provided_indexes.emplace("Long Read Minimizers", optarg); provided_indexes.emplace("Short Read Minimizers", optarg); - provided_indexes.emplace("Long Read PathMinimizers", optarg); break; case 'z': provided_indexes.emplace("Long Read Zipcodes", optarg); provided_indexes.emplace("Short Read Zipcodes", optarg); - provided_indexes.emplace("Long Read PathZipcodes", optarg); break; case 'd': provided_indexes.emplace("Giraffe Distance Index", optarg); @@ -1337,12 +1366,18 @@ int main_giraffe(int argc, char** argv) { found->second.apply(*parser); } } + // TODO: Can we make the presets know if they are long read + // presets and not list their names here? What if someone + // manually turns on chaining? if (param_preset == "hifi" || param_preset == "r10") { map_long_reads = true; } break; case 'E': - use_path_minimizer = true; + rec_mode = true; + break; + case OPT_NO_REC_MODE: + rec_mode = false; break; case 'A': { @@ -1520,9 +1555,10 @@ int main_giraffe(int argc, char** argv) { logger.error() << "Paired-end alignment is not yet implemented " << "for --align-from-chains or chaining-based presets" << std::endl; } - if (use_path_minimizer && !map_long_reads) { - // We don't have file paths to load defined for recombination-aware short-read minimizers. - logger.error() << "Path minimizers cannot be used with short reads." << endl; + if (rec_mode.has_value() && rec_mode.value() && !map_long_reads) { + // The user has specifically asked fro recombination-aware mapping on short reads, which isn't available. + // TODO: Get a better idea of if reads are long. + logger.error() << "Recombination-aware mode cannot be used with short reads yet." << endl; } // Use all the provided indexes @@ -1534,7 +1570,7 @@ int main_giraffe(int argc, char** argv) { // or when either --haplotype-name or --kff-name is given. // Otherwise we'd always sample when we had reads available. bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); - + string sample_scope; if (haplotype_sampling) { @@ -1639,13 +1675,8 @@ int main_giraffe(int argc, char** argv) { {"Giraffe Distance Index", {"dist"}} }; if (map_long_reads) { - if (use_path_minimizer) { - indexes_and_extensions.emplace(std::string("Long Read PathMinimizers"), std::vector({"longread.path.min","path.min", "min"})); - indexes_and_extensions.emplace(std::string("Long Read PathZipcodes"), std::vector({"longread.path.zipcodes", "path.zipcodes", "zipcodes"})); - } else { - indexes_and_extensions.emplace(std::string("Long Read Minimizers"), std::vector({"longread.withzip.min","withzip.min", "min"})); - indexes_and_extensions.emplace(std::string("Long Read Zipcodes"), std::vector({"longread.zipcodes", "zipcodes"})); - } + indexes_and_extensions.emplace(std::string("Long Read Minimizers"), std::vector({"longread.path.min", "longread.withzip.min", "path.min", "withzip.min", "min"})); + indexes_and_extensions.emplace(std::string("Long Read Zipcodes"), std::vector({"longread.path.zipcodes", "longread.zipcodes", "path.zipcodes", "zipcodes"})); } else { indexes_and_extensions.emplace(std::string("Short Read Minimizers"), std::vector({"shortread.withzip.min","withzip.min", "min"})); indexes_and_extensions.emplace(std::string("Short Read Zipcodes"), std::vector({"shortread.zipcodes", "zipcodes"})); @@ -1703,14 +1734,12 @@ int main_giraffe(int argc, char** argv) { } //If we're making new zipcodes, we should rebuild the minimizers too - if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { + if (!indexes_and_extensions.count(std::string("Long Read Minimizers")) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; registry.reset(std::string("Long Read Minimizers")); - registry.reset(std::string("Long Read PathMinimizers")); - } else if ((indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && !indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { + } else if (indexes_and_extensions.count(std::string("Long Read Minimizers")) && !indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding zipcodes index to match new minimizers" << endl; registry.reset(std::string("Long Read Zipcodes")); - registry.reset(std::string("Long Read PathZipcodes")); } else if (!indexes_and_extensions.count(std::string("Short Read Minimizers")) && indexes_and_extensions.count(std::string("Short Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; registry.reset(std::string("Short Read Minimizers")); @@ -1729,11 +1758,7 @@ int main_giraffe(int argc, char** argv) { if (!map_long_reads) { index_targets = VGIndexes::get_default_short_giraffe_indexes(); } else { - if (use_path_minimizer) { - index_targets = VGIndexes::get_default_long_path_giraffe_indexes(); - } else { - index_targets = VGIndexes::get_default_long_giraffe_indexes(); - } + index_targets = VGIndexes::get_default_long_giraffe_indexes(); } #ifdef debug for (auto& needed : index_targets) { @@ -1766,15 +1791,9 @@ int main_giraffe(int argc, char** argv) { } IndexName minimizer_indexname; unique_ptr minimizer_index; - MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { - if (use_path_minimizer) { - minimizer_indexname = "Long Read PathMinimizers"; - payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; - } else { - // Use the long read minimizers - minimizer_indexname = "Long Read Minimizers"; - } + // Use the long read minimizers + minimizer_indexname = "Long Read Minimizers"; } else { minimizer_indexname = "Short Read Minimizers"; } @@ -1782,8 +1801,23 @@ int main_giraffe(int argc, char** argv) { logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " which depends on it" << std::endl; } minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); - require_payload(*minimizer_index, payload_type); + // Decide if we want to do recombination-aware mapping by default, based on whether we can. + + // Whether to use recombination-aware mapping if the user did not set the option. + // Can't be a constant because it has to depend on the haplotype count in the index. + bool default_rec_mode = has_payload(*minimizer_index, MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS) && map_long_reads; + + std::vector allowed_payloads; + allowed_payloads.push_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS); + if (!rec_mode.value_or(default_rec_mode)) { + // If we're not doing recombination-aware mapping, we can also accept a + // payload without path info, as long as it still has zipcodes. + allowed_payloads.push_back(MinimizerIndexParameters::PAYLOAD_ZIPCODES); + } + // Make sure we have a usable minimizer index payload. + require_payload(*minimizer_index, allowed_payloads); + // Grab the zipcodes if (show_progress) { logger.info() << "Loading Zipcodes" << endl; @@ -1791,11 +1825,7 @@ int main_giraffe(int argc, char** argv) { IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { - if (use_path_minimizer) { - oversized_zipcodes_indexname = "Long Read PathZipcodes"; - } else { - oversized_zipcodes_indexname = "Long Read Zipcodes"; - } + oversized_zipcodes_indexname = "Long Read Zipcodes"; } else { oversized_zipcodes_indexname = "Short Read Zipcodes"; } @@ -1965,6 +1995,14 @@ int main_giraffe(int argc, char** argv) { } } }; + auto report_paired_flag = [&](const std::string& name, bool value) { + if (value) { + params_json << ",\"" << name << "\":" << (value ? "true" : "false"); + if (show_progress) { + cerr << "--" << (value ? "" : "no-") << name << endl; + } + } + }; auto report_number = [&](const std::string& name, size_t value) { params_json << ",\"" << name << "\":" << value; if (show_progress) { @@ -2003,6 +2041,9 @@ int main_giraffe(int argc, char** argv) { report_string("rescue-algorithm", algorithm_names[rescue_algorithm]); } minimizer_mapper.rescue_algorithm = rescue_algorithm; + report_paired_flag("rec-mode", rec_mode.value_or(default_rec_mode)); + minimizer_mapper.use_payload_paths &= rec_mode.value_or(default_rec_mode); + params_json << "}" << std::endl; diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 379e240463..f2f69da695 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -60,6 +60,9 @@ struct MinimizerConfig { // Construction parameters. MinimizerIndexParameters params; bool require_distance_index = true; + /// Should we enforce that the resulting minimizer index has path + /// payloads, for recombination-aware mapping? + bool require_path_payloads = false; // Other options. // Note that params also has a 'progress' field. @@ -99,7 +102,8 @@ int main_minimizer(int argc, char** argv) { gbz, distance_index.get(), &oversized_zipcodes, - config.params + config.params, + config.require_path_payloads ); // Close the distance index so it can't seem to be modified after the files @@ -159,13 +163,14 @@ void help_minimizer(char** argv) { std::cerr << "Other options:" << std::endl; std::cerr << " -z, --zipcode-name FILE store the distances that are too big in a file" << std::endl; std::cerr << " if no -z, some distances may be discarded" << std::endl; - std::cerr << " -E, --rec-mode build recombination-aware MinimizerIndex" << std::endl; std::cerr << " -p, --progress show progress information" << std::endl; std::cerr << " -t, --threads N use N threads for index construction " << "[" << get_default_threads() << "]" << std::endl; std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl; std::cerr << " --no-dist build the index without distance index annotations" << std::endl; std::cerr << " (not recommended)" << std::endl; + std::cerr << " -E, --rec-mode assert MinimizerIndex will support" << std::endl; + std::cerr << " recombination-aware mapping" << std::endl; std::cerr << " -h, --help print this help message to stderr and exit" << std::endl; std::cerr << std::endl; } @@ -188,7 +193,6 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L while (true) { static struct option long_options[] = { - { "rec-mode", no_argument, 0, 'E' }, { "distance-index", required_argument, 0, 'd' }, { "output-name", required_argument, 0, 'o' }, { "kmer-length", required_argument, 0, 'k' }, @@ -205,20 +209,17 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L { "progress", no_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, { "no-dist", no_argument, 0, OPT_NO_DIST }, + { "rec-mode", no_argument, 0, 'E' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; - c = getopt_long(argc, argv, "Ed:o:k:w:cs:Wz:pt:h?", long_options, &option_index); + c = getopt_long(argc, argv, "d:o:k:w:cs:Wz:pt:Eh?", long_options, &option_index); if (c == -1) { break; } // End of options. switch (c) { - case 'E': - this->params.paths_in_payload = true; - logger.warn() << "--rec-mode is still under development" << std::endl; - break; case 'd': this->distance_name = optarg; break; @@ -271,6 +272,9 @@ MinimizerConfig::MinimizerConfig(int argc, char** argv, int max_threads, const L case OPT_NO_DIST: this->require_distance_index = false; break; + case 'E': + this->require_path_payloads = true; + break; case 'h': case '?': diff --git a/src/subcommand/options.hpp b/src/subcommand/options.hpp index 0f1d38a3f2..54e8fd62bd 100644 --- a/src/subcommand/options.hpp +++ b/src/subcommand/options.hpp @@ -150,7 +150,7 @@ struct TickChainLink { /// May not delegate to a different item. virtual bool tick_along_chain(); - /// Reset along the chain, makign this item and all parents take on their + /// Reset along the chain, making this item and all parents take on their /// initial values. virtual void reset_along_chain(); }; @@ -883,7 +883,7 @@ struct BaseOptionGroup : public TickChainLink { /// otherwies. virtual bool set(const BaseValuation& entry) = 0; - /// Fill in entry with the value of the correspondign option, if we have + /// Fill in entry with the value of the corresponding option, if we have /// that option. If so, return true. virtual bool query(BaseValuation& entry) const = 0; diff --git a/test/reads/small.recombined.fq b/test/reads/small.recombined.fq new file mode 100644 index 0000000000..e724391d97 --- /dev/null +++ b/test/reads/small.recombined.fq @@ -0,0 +1,4 @@ +@recombined +ATACCAACTCTCTGGGTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTCTTCCTTTGATTTATTTGAAGTAACGTTTGACAATCTATCACCAGGGGTAATGTGGGGAAATGGAAAGAATACAAAGATCTGGAGCCAGACAAATCTGGGTCCAAGTCCTCACTTTGCCGCAGA ++ +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index a3a2d44160..ef67f49599 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 11 +plan tests 14 # Indexing a single graph @@ -42,10 +42,19 @@ is $(md5sum x.mi | cut -f 1 -d\ ) 2cf6a0fdf9d4fe6ee9b42ae2f093971b "setting -k - vg minimizer -t 1 -o x.mi -z x.zipcodes -d x.dist x.gbz is $? 0 "construction with zipcode payload" +# Make sure the default is to include path information; we know this graph has +# few enough paths +is "$(vg describe x.mi | grep 'payload =' | cut -f2 -d'=' | sed 's/^ *//')" "zipcodes with paths" "zipcodes include paths by default" + # Store zipcode payload and path information in the index # Construction will not be deterministic because the snarls are not deterministic vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x.gbz -is $? 0 "construction with zipcode payload and path information" +is $? 0 "construction with zipcode payload and mandatory path information" +# Build a path cover index +vg gbwt -P -x x.vg -g x_cover.gbz +vg minimizer -t 1 -o x.mi -z x.zipcodes --rec-mode -d x.dist x_cover.gbz 2>error.txt +is $? "1" "construction with mandatory path information fails on a path cover GBZ" +is $(grep 'path cover' error.txt | wc -l) "1" "error message references path cover as the problem" -rm -f x.vg x.dist x.mi x.zipcodes x.gbz +rm -f x.vg x.dist x.mi x.zipcodes x.gbz x_cover.gbz error.txt diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 5ec9a33612..56c07d4021 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 82 +plan tests 83 vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg @@ -27,11 +27,6 @@ rm -f x-haps.gbwt x-paths.gbwt x-merged.gbwt vg giraffe -Z x.giraffe.gbz -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam is "${?}" "0" "a read can be mapped with gbz + min + zips + dist specified without crashing" -vg minimizer -d x.dist --rec-mode -o wrong.min -z wrong.zipcodes xx.gbz -vg giraffe -Z x.giraffe.gbz -m wrong.min -z wrong.zipcodes -d x.dist -f reads/small.middle.ref.fq > mapped1.gam -is "${?}" "1" "a minimizer index with the wrong payload type is rejected" -rm -f wrong.min wrong.zipcodes - rm -f x.shortread.withzip.min rm -f x.shortread.zipcodes vg giraffe -Z x.giraffe.gbz -d x.dist -f reads/small.middle.ref.fq > mapped1.gam @@ -69,6 +64,19 @@ rm -f log.txt vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.mismatched.fq -b chaining-sr >/dev/null is "${?}" "0" "a read with a mismatch can be mapped with the short read chaining preset" +rm -f x.longread.withzip.min +rm -f x.longread.zipcodes + +vg giraffe -Z x.giraffe.gbz -f reads/small.recombined.fq -b hifi >mapped.gam +is "$(vg view -aj mapped.gam | jq '.annotation.chain.rec_count')" "1" "recombinations are counted by default when mapping with the hifi preset" + +vg giraffe -Z x.giraffe.gbz -f reads/small.recombined.fq -b hifi --rec-penalty-aln 9999 >mapped.gam +is "$(vg view -aj mapped.gam | jq '.score // 0')" "0" "recombination alignment score penalty is used by default and can unmap a read" + +rm -f mapped.gam +rm -f x.longread.withzip.min +rm -f x.longread.zipcodes + rm -Rf grid-out mkdir grid-out vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.fq --output-basename grid-out/file --hard-hit-cap 5:6