From 4b3b06c80ec228c5dba4169f02b86f2fae2dca14 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Thu, 14 May 2026 16:57:39 -0700 Subject: [PATCH 1/9] minor speedup --- src/zip_code_tree.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 2a7933adc3..a994cefdcf 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -361,15 +361,15 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con forest_state.open_chains.back().second = flank_distances.at(seed_index) > forest_state.distance_limit; } else if (first_type != ZipCodeTree::CHAIN_START) { // For all except the first thing in a node/chain, we need to add an edge - size_t distance_between = std::numeric_limits::max(); - if (is_trivial_chain || current_type == ZipCode::ROOT_NODE || + size_t distance_between; + if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE && forest_state.sibling_indices_at_depth[chain_depth][0].chain_component - == current_seed.zipcode.get_chain_component(depth)) { - if (previous_offset != std::numeric_limits::max()) { - // In same & reachable chain component, so can find distance from last thing - distance_between = (std::max(current_offset, previous_offset) - - std::min(current_offset, previous_offset)); - } + != current_seed.zipcode.get_chain_component(depth)) { + // In different componenets, so no edge is possible + distance_between = std::numeric_limits::max(); + } else { + // In same & reachable chain component, so can find distance from last thing + distance_between = std::max(current_offset, previous_offset) - std::min(current_offset, previous_offset); } if (chain_depth == 0 && distance_between > forest_state.distance_limit) { From afb622b06bddc101dcb89677ce2e7e9e4fbf9654 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Thu, 14 May 2026 17:44:15 -0700 Subject: [PATCH 2/9] look up less --- src/zip_code_tree.cpp | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index a994cefdcf..1b9096233e 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -1988,8 +1988,9 @@ void ZipCodeTree::distance_iterator::unimplemented_error() { } auto ZipCodeTree::distance_iterator::tick() -> bool { + tree_item_t current = current_item(); #ifdef debug_parse - std::cerr << "Tick for state " << pos.state << " on symbol " << current_item().get_type() + std::cerr << "Tick for state " << pos.state << " on symbol " << current.get_type() << " at entry " << pos.index << std::endl; std::cerr << "Stack: "; std::stack temp_stack = pos.stack_data; @@ -2006,9 +2007,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Initial state. // // Stack is empty and we must be at a seed to start at. - if (current_item().get_type() == SEED) { + if (current.get_type() == SEED) { #ifdef debug_parse - std::cerr << "Skip over seed " << current_item().get_value() << std::endl; + std::cerr << "Skip over seed " << current.get_value() << std::endl; #endif push(0); pos.state = S_SCAN_CHAIN; @@ -2023,10 +2024,10 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // that running distances to use at the other chains in the snarl, and // under that running distances to use for the other chains in the // snarl's parent snarl, etc. - if (current_item().get_type() == SEED) { + if (current.get_type() == SEED) { // Calculate which direction this seed would be traversed in - bool cur_is_rev = pos.right_to_left ? current_item().get_is_reversed() - : !current_item().get_is_reversed(); + bool cur_is_rev = pos.right_to_left ? current.get_is_reversed() + : !current.get_is_reversed(); bool origin_is_rev = original_right_to_left ? zip_code_tree->at(original_index).get_is_reversed() : !zip_code_tree->at(original_index).get_is_reversed(); if (top() > 0 && cur_is_rev == origin_is_rev) { @@ -2035,7 +2036,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { crash_unless(depth() > 0); #endif #ifdef debug_parse - std::cerr << "Yield seed " << current_item().get_value() << ", distance " << top() << std::endl; + std::cerr << "Yield seed " << current.get_value() << ", distance " << top() << std::endl; #endif return true; } else { @@ -2043,14 +2044,14 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or it is in the opposite read orientation, // thus it could not be used anyhow: we skip it #ifdef debug_parse - std::cerr << "Skip seed " << current_item().get_value() << std::endl; + std::cerr << "Skip seed " << current.get_value() << std::endl; #endif } } else if (entered_snarl()) { // Running distance along chain is on stack, // and will need to be added to all the stored distances. - initialize_snarl(current_item().get_type() == SNARL_START ? 0 - : std::numeric_limits::max()); + initialize_snarl(current.get_type() == SNARL_START ? 0 + : std::numeric_limits::max()); } else if (exited_chain()) { if (depth() == 1) { // We never entered the parent snarl of this chain, so stack up @@ -2086,9 +2087,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or running distance to cross the snarl, will be under it. continue_snarl(); } - } else if (current_item().get_type() == EDGE) { + } else if (current.get_type() == EDGE) { // Add value to running distance - top() = SnarlDistanceIndex::sum(top(), current_item().get_value()); + top() = SnarlDistanceIndex::sum(top(), current.get_value()); if (top() > distance_limit || top() == std::numeric_limits::max()) { // Skip over the rest of this chain if (depth() == 1) { @@ -2096,7 +2097,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // So if the distance along the chain is too much, there // are not going to be any results with a smaller distance. #ifdef debug_parse - std::cerr << "Halt because adding " << current_item().get_value() << " bp " + std::cerr << "Halt because adding " << current.get_value() << " bp " << "gives distance " << top() << " > " << distance_limit << std::endl; #endif halt(); @@ -2116,7 +2117,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // // Stack has at the top running distances to use for each chain still // to be visited, and under those the same for the snarl above, etc. - if (current_item().get_type() == EDGE) { + if (current.get_type() == EDGE) { // Hit the distance matrix; finished snarl // Top of the stack should be the parent running distance // Jump to the start of the snarl @@ -2143,13 +2144,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Cyclic snarls are traversed by "bouncing": entering each chain // from the left side, starting with the leftmost, and then once we // hit the other end of the snarl, entering each chain from the right. - if (current_item().get_type() == SNARL_END) { + if (current.get_type() == SNARL_END) { // Finished left sides, now doing right sides. #ifdef check_parse crash_unless(!pos.right_to_left); #endif pos.right_to_left = true; - } else if (current_item().get_type() == EDGE) { + } else if (current.get_type() == EDGE) { // Hit distance matrix again; finished snarl // Put original direction over distance to bound swap(); From 8f8789e5d9127831c3e5e97d5de32d5e849460d0 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Fri, 15 May 2026 10:05:22 -0700 Subject: [PATCH 3/9] look up even less --- src/zip_code_tree.cpp | 109 ++++++++++++++++++++++-------------------- src/zip_code_tree.hpp | 37 +++++++++----- 2 files changed, 82 insertions(+), 64 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 1b9096233e..05aa4f624f 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -354,6 +354,8 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con ///////////////////// Record distance from the previous thing in chain/node // Or add a new tree if the distance is too far auto first_type = forest_state.sibling_indices_at_depth[chain_depth][0].type; + // Memory in case we look it up, to avoid looking it up again + size_t chain_comp = std::numeric_limits::max(); if (chain_depth > 0 && first_type == ZipCodeTree::CHAIN_START) { // If this is the first thing in a non-root chain or node, // check whether it is connected to the front of the snarl @@ -362,9 +364,12 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con } else if (first_type != ZipCodeTree::CHAIN_START) { // For all except the first thing in a node/chain, we need to add an edge size_t distance_between; - if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE && - forest_state.sibling_indices_at_depth[chain_depth][0].chain_component - != current_seed.zipcode.get_chain_component(depth)) { + if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE) { + // This will be needed + chain_comp = current_seed.zipcode.get_chain_component(depth); + } + if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE && + forest_state.sibling_indices_at_depth[chain_depth][0].chain_component != chain_comp) { // In different componenets, so no edge is possible distance_between = std::numeric_limits::max(); } else { @@ -497,8 +502,10 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE) { // If needed, remember the chain component - forest_state.sibling_indices_at_depth[chain_depth].back().chain_component - = current_seed.zipcode.get_chain_component(depth); + if (chain_comp == std::numeric_limits::max()) { + chain_comp = current_seed.zipcode.get_chain_component(depth); + } + forest_state.sibling_indices_at_depth[chain_depth].back().chain_component = chain_comp; } #ifdef DEBUG_ZIP_CODE_TREE cerr << "Added sibling with type " << current_type << endl; @@ -1448,7 +1455,7 @@ auto ZipCodeTree::seed_iterator::operator++() -> seed_iterator& { if (cyclic_snarl_nestedness > 0) { #ifdef debug_parse std::cerr << "Reversing direction in cyclic snarl at seed " - << current_item().get_value() << std::endl; + << current_item.get_value() << std::endl; #endif // We're currently going right to left, which means we just finished // our first traversal starting from this seed. Need to also do a @@ -1524,7 +1531,8 @@ ZipCodeTree::distance_iterator::distance_iterator(size_t start_index, size_t distance_limit) : original_index(start_index), end_index(right_to_left ? 0 : (zip_code_tree.size() - 1)), zip_code_tree(&zip_code_tree), original_right_to_left(right_to_left), distance_limit(distance_limit), - pos(start_index, right_to_left, std::stack(), chain_numbers, S_START) { + pos(start_index, right_to_left, std::stack(), chain_numbers, S_START), + current_item(zip_code_tree.at(start_index)) { if (done()) { // We are an end iterator. Nothing else to do. return; @@ -1560,7 +1568,7 @@ auto ZipCodeTree::distance_iterator::operator++() -> distance_iterator& { // a seed that has been ticked and yielded, or to end. if (!done()) { #ifdef debug_parse - std::cerr << "Skipping over a " << current_item().get_type() + std::cerr << "Skipping over a " << current_item.get_type() << " which we assume was handled already." << std::endl; #endif move_index(); @@ -1581,15 +1589,15 @@ auto ZipCodeTree::distance_iterator::operator*() const -> seed_result_t { // We are always at a seed, so show that seed #ifdef check_parse crash_unless(!done()); - crash_unless(current_item().get_type() == SEED); + crash_unless(current_item.get_type() == SEED); crash_unless(!pos.stack_data.empty()); #endif // We know the running distance to this seed will be at the top of the stack // If we're at the exact same position, the distance must be 0 size_t distance = pos.stack_data.top(); - bool is_reversed = pos.right_to_left ? current_item().get_is_reversed() - : !current_item().get_is_reversed(); - return {distance, current_item().get_value(), is_reversed}; + bool is_reversed = pos.right_to_left ? current_item.get_is_reversed() + : !current_item.get_is_reversed(); + return {distance, current_item.get_value(), is_reversed}; } auto ZipCodeTree::distance_iterator::push(size_t value) -> void { @@ -1805,7 +1813,7 @@ void ZipCodeTree::distance_iterator::skip_chain() { // Discard distance for this chain pop(); // Jump to other bound - pos.index = pop(); + change_index_to(pop()); #ifdef debug_parse std::cerr << "Skip chain, jump to index " << pos.index << std::endl; #endif @@ -1815,7 +1823,7 @@ void ZipCodeTree::distance_iterator::skip_chain() { void ZipCodeTree::distance_iterator::initialize_chain() { // Where *would* we jump, if we jumped? - push(pos.index + current_item().get_other_bound_offset()); + push(pos.index + current_item.get_other_bound_offset()); swap(); if (top() > distance_limit || top() == std::numeric_limits::max()) { #ifdef debug_parse @@ -1834,7 +1842,7 @@ void ZipCodeTree::distance_iterator::initialize_chain() { void ZipCodeTree::distance_iterator::save_opposite_cyclic_snarl_exit(size_t chain_num) { std::stack save_stack = pos.stack_data; std::stack save_chain_numbers = pos.chain_numbers; - size_t snarl_start_i = pos.index - current_item().get_value(); + size_t snarl_start_i = pos.index - current_item.get_value(); // Exit out the bound that we're NOT pointing at // (L->R exits at the start, R->L at the end) @@ -1869,7 +1877,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { << " going " << (pos.right_to_left ? "right to left" : "left to right") << " with running distance " << top() << std::endl; #endif - if (current_item().get_value() == std::numeric_limits::max()) { + if (current_item.get_value() == std::numeric_limits::max()) { // This is a root-level snarl, which has no distance matrix #ifdef debug_parse std::cerr << "Tried to initialize a root-level snarl; halting now." << std::endl; @@ -1880,7 +1888,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { } // Grab distances for this snarl - size_t snarl_start_i = pos.index - current_item().get_value(); + size_t snarl_start_i = pos.index - current_item.get_value(); bool is_cyclic = zip_code_tree->at(snarl_start_i).get_is_cyclic(); bool start_right_to_left = pos.right_to_left; @@ -1892,9 +1900,9 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { // Drop chain running distance pop(); // Jump to snarl end - pos.index = snarl_start_i; + change_index_to(snarl_start_i); if (!pos.right_to_left) { - pos.index += zip_code_tree->at(snarl_start_i).get_other_bound_offset(); + shift_index_by(zip_code_tree->at(snarl_start_i).get_other_bound_offset()); } #ifdef debug_parse std::cerr << "Skip cyclic snarl, jump to index " << pos.index << std::endl; @@ -1930,9 +1938,9 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { #endif // We want to start with the leftmost chain // Jump to CHAIN_COUNT - pos.index = snarl_start_i + 1; + change_index_to(snarl_start_i + 1); // Distance matrix has chains & bounds - size_t node_count = current_item().get_value() + 1; + size_t node_count = current_item.get_value() + 1; // Skip past distance matrix with N chains and thus N+1 nodes // A cyclic snarl's matrix has 2 rows per chain plus 2 for bounds, // and it also stores self-loop distances (i.e. main diagonal) @@ -1940,8 +1948,8 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { // A DAG snarl's matrix has 1 row per chain plus 2 for bounds // and it does NOT store self-loop distances (i.e. no main diagonal) // That means (N+1) rows, so the matrix is the (N+1) triangle number - pos.index += is_cyclic ? (node_count*2 * (node_count*2 + 1)) / 2 - : (node_count * (node_count + 1)) / 2; + shift_index_by(is_cyclic ? (node_count*2 * (node_count*2 + 1)) / 2 + : (node_count * (node_count + 1)) / 2); #ifdef debug_parse std::cerr << "Jump to index " << pos.index << " after distance matrix" << std::endl; #endif @@ -1950,7 +1958,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { void ZipCodeTree::distance_iterator::continue_snarl() { // Different scanning states based on snarl type - if (current_item().get_is_cyclic()) { + if (current_item.get_is_cyclic()) { #ifdef debug_parse std::cerr << "Continuing cyclic snarl" << std::endl; #endif @@ -1965,6 +1973,7 @@ void ZipCodeTree::distance_iterator::continue_snarl() { void ZipCodeTree::distance_iterator::use_saved_traversal() { pos = std::move(pending_traversals.top()); + current_item = zip_code_tree->at(pos.index); pending_traversals.pop(); // Swap end index to match new direction end_index = pos.right_to_left ? 0 : (zip_code_tree->size() - 1); @@ -1979,18 +1988,17 @@ auto ZipCodeTree::distance_iterator::halt() -> void { #ifdef debug_parse std::cerr << "Halt iteration!" << std::endl; #endif - pos.index = end_index; + change_index_to(end_index); } void ZipCodeTree::distance_iterator::unimplemented_error() { - throw std::domain_error("Unimplemented symbol " + std::to_string(current_item().get_type()) + throw std::domain_error("Unimplemented symbol " + std::to_string(current_item.get_type()) + " for state " + std::to_string(pos.state)); } auto ZipCodeTree::distance_iterator::tick() -> bool { - tree_item_t current = current_item(); #ifdef debug_parse - std::cerr << "Tick for state " << pos.state << " on symbol " << current.get_type() + std::cerr << "Tick for state " << pos.state << " on symbol " << current_item.get_type() << " at entry " << pos.index << std::endl; std::cerr << "Stack: "; std::stack temp_stack = pos.stack_data; @@ -2007,9 +2015,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Initial state. // // Stack is empty and we must be at a seed to start at. - if (current.get_type() == SEED) { + if (current_item.get_type() == SEED) { #ifdef debug_parse - std::cerr << "Skip over seed " << current.get_value() << std::endl; + std::cerr << "Skip over seed " << current_item.get_value() << std::endl; #endif push(0); pos.state = S_SCAN_CHAIN; @@ -2024,10 +2032,10 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // that running distances to use at the other chains in the snarl, and // under that running distances to use for the other chains in the // snarl's parent snarl, etc. - if (current.get_type() == SEED) { + if (current_item.get_type() == SEED) { // Calculate which direction this seed would be traversed in - bool cur_is_rev = pos.right_to_left ? current.get_is_reversed() - : !current.get_is_reversed(); + bool cur_is_rev = pos.right_to_left ? current_item.get_is_reversed() + : !current_item.get_is_reversed(); bool origin_is_rev = original_right_to_left ? zip_code_tree->at(original_index).get_is_reversed() : !zip_code_tree->at(original_index).get_is_reversed(); if (top() > 0 && cur_is_rev == origin_is_rev) { @@ -2036,7 +2044,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { crash_unless(depth() > 0); #endif #ifdef debug_parse - std::cerr << "Yield seed " << current.get_value() << ", distance " << top() << std::endl; + std::cerr << "Yield seed " << current_item.get_value() << ", distance " << top() << std::endl; #endif return true; } else { @@ -2044,13 +2052,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or it is in the opposite read orientation, // thus it could not be used anyhow: we skip it #ifdef debug_parse - std::cerr << "Skip seed " << current.get_value() << std::endl; + std::cerr << "Skip seed " << current_item.get_value() << std::endl; #endif } } else if (entered_snarl()) { // Running distance along chain is on stack, // and will need to be added to all the stored distances. - initialize_snarl(current.get_type() == SNARL_START ? 0 + initialize_snarl(current_item.get_type() == SNARL_START ? 0 : std::numeric_limits::max()); } else if (exited_chain()) { if (depth() == 1) { @@ -2087,9 +2095,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or running distance to cross the snarl, will be under it. continue_snarl(); } - } else if (current.get_type() == EDGE) { + } else if (current_item.get_type() == EDGE) { // Add value to running distance - top() = SnarlDistanceIndex::sum(top(), current.get_value()); + top() = SnarlDistanceIndex::sum(top(), current_item.get_value()); if (top() > distance_limit || top() == std::numeric_limits::max()) { // Skip over the rest of this chain if (depth() == 1) { @@ -2097,7 +2105,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // So if the distance along the chain is too much, there // are not going to be any results with a smaller distance. #ifdef debug_parse - std::cerr << "Halt because adding " << current.get_value() << " bp " + std::cerr << "Halt because adding " << current_item.get_value() << " bp " << "gives distance " << top() << " > " << distance_limit << std::endl; #endif halt(); @@ -2117,13 +2125,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // // Stack has at the top running distances to use for each chain still // to be visited, and under those the same for the snarl above, etc. - if (current.get_type() == EDGE) { + if (current_item.get_type() == EDGE) { // Hit the distance matrix; finished snarl // Top of the stack should be the parent running distance // Jump to the start of the snarl // Use snarl start offset from previous chain - ++pos.index; - pos.index -= current_item().get_value() ; + shift_index_by(1); + shift_index_by(-current_item.get_value()); // Resume scanning chain pos.state = S_SCAN_CHAIN; } else if (exited_snarl()) { @@ -2144,13 +2152,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Cyclic snarls are traversed by "bouncing": entering each chain // from the left side, starting with the leftmost, and then once we // hit the other end of the snarl, entering each chain from the right. - if (current.get_type() == SNARL_END) { + if (current_item.get_type() == SNARL_END) { // Finished left sides, now doing right sides. #ifdef check_parse crash_unless(!pos.right_to_left); #endif pos.right_to_left = true; - } else if (current.get_type() == EDGE) { + } else if (current_item.get_type() == EDGE) { // Hit distance matrix again; finished snarl // Put original direction over distance to bound swap(); @@ -2160,14 +2168,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { #endif // The top of the stack will be the original direction we were going pos.right_to_left = (pop() == 1); - ++pos.index; - size_t snarl_start_i = pos.index - current_item().get_value(); - if (pos.right_to_left) { - // Jump to snarl start - pos.index = snarl_start_i; - } else { + shift_index_by(1); + size_t snarl_start_i = pos.index - current_item.get_value(); + // Jump to snarl start + change_index_to(snarl_start_i); + if (!pos.right_to_left) { // Jump to snarl end - pos.index = zip_code_tree->at(snarl_start_i).get_other_bound_offset() + snarl_start_i; + shift_index_by(current_item.get_other_bound_offset()); } #ifdef debug_parse std::cerr << "Jump to index " << pos.index << endl; diff --git a/src/zip_code_tree.hpp b/src/zip_code_tree.hpp index 42af074ab3..eba2d9243c 100644 --- a/src/zip_code_tree.hpp +++ b/src/zip_code_tree.hpp @@ -514,9 +514,9 @@ class ZipCodeTree { // This might happen due to race conditions added // with the save-and-reload behavior if (pos.right_to_left) { - --pos.index; + shift_index_by(-1); } else { - ++pos.index; + shift_index_by(1); } } if (done() && !pending_traversals.empty()) { @@ -597,6 +597,8 @@ class ZipCodeTree { }; /// The iterator's current position (state, direction, stack, etc.) iteration_position pos; + /// Cheap lookup for the item at that position + tree_item_t current_item; /// Starting positions of other traversals that we will do later /// When the current traversal can no longer go on, /// we pop one of these and set the current position to it @@ -609,6 +611,18 @@ class ZipCodeTree { /// as the current one, e.g. using an edge C1_R -> SNARL_START void save_opposite_cyclic_snarl_exit(size_t chain_num); + /// Shift the current index & item + inline void shift_index_by(int shift) { + pos.index += shift; + current_item = zip_code_tree->at(pos.index); + } + + /// Teleport the current index + inline void change_index_to(int new_index) { + pos.index = new_index; + current_item = zip_code_tree->at(pos.index); + } + // Now we define a mini stack language so we can do a // not-really-a-pushdown-automaton to parse the distance strings. @@ -666,26 +680,23 @@ class ZipCodeTree { /// Throw a domain_error that the current state/symbol combo is unimplemented. void unimplemented_error(); - /// What item does index point to? - inline tree_item_t current_item() const { return zip_code_tree->at(pos. index); } - /// Check if the current symbol is an entrance/exit, /// based on the direction the iterator is going (right_to_left) inline bool entered_snarl() const { - return (pos.right_to_left && current_item().get_type() == ZipCodeTree::SNARL_END) - || (!pos.right_to_left && current_item().get_type() == ZipCodeTree::SNARL_START); + return (pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_END) + || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_START); } inline bool exited_snarl() const { - return (pos.right_to_left && current_item().get_type() == ZipCodeTree::SNARL_START) - || (!pos.right_to_left && current_item().get_type() == ZipCodeTree::SNARL_END); + return (pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_START) + || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_END); } inline bool entered_chain() const { - return (pos.right_to_left && current_item().get_type() == ZipCodeTree::CHAIN_END) - || (!pos.right_to_left && current_item().get_type() == ZipCodeTree::CHAIN_START); + return (pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_END) + || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_START); } inline bool exited_chain() const { - return (pos.right_to_left && current_item().get_type() == ZipCodeTree::CHAIN_START) - || (!pos.right_to_left && current_item().get_type() == ZipCodeTree::CHAIN_END); + return (pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_START) + || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_END); } /// Skip the current chain, jumping to the matching end From 686b032cac4f7f0afc1874d1af0d24c9f216c6cd Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Fri, 15 May 2026 16:15:17 -0700 Subject: [PATCH 4/9] reuse old lookup --- src/zip_code_tree.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 05aa4f624f..4b14f78019 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -354,8 +354,7 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con ///////////////////// Record distance from the previous thing in chain/node // Or add a new tree if the distance is too far auto first_type = forest_state.sibling_indices_at_depth[chain_depth][0].type; - // Memory in case we look it up, to avoid looking it up again - size_t chain_comp = std::numeric_limits::max(); + size_t chain_comp = forest_state.sort_values_by_seed.at(seed_index).get_chain_component(); if (chain_depth > 0 && first_type == ZipCodeTree::CHAIN_START) { // If this is the first thing in a non-root chain or node, // check whether it is connected to the front of the snarl @@ -364,10 +363,6 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con } else if (first_type != ZipCodeTree::CHAIN_START) { // For all except the first thing in a node/chain, we need to add an edge size_t distance_between; - if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE) { - // This will be needed - chain_comp = current_seed.zipcode.get_chain_component(depth); - } if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE && forest_state.sibling_indices_at_depth[chain_depth][0].chain_component != chain_comp) { // In different componenets, so no edge is possible @@ -502,9 +497,6 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, con if (!is_trivial_chain && current_type != ZipCode::ROOT_NODE) { // If needed, remember the chain component - if (chain_comp == std::numeric_limits::max()) { - chain_comp = current_seed.zipcode.get_chain_component(depth); - } forest_state.sibling_indices_at_depth[chain_depth].back().chain_component = chain_comp; } #ifdef DEBUG_ZIP_CODE_TREE @@ -2246,7 +2238,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cons cerr << "\t\tThis is the root snarl so sort by connected component: " << seed.zipcode.get_distance_index_address(0) << endl; #endif - sort_values_by_seed[zipcode_sort_order[i]].set_sort_value( seed.zipcode.get_distance_index_address(0)); + sort_values_by_seed[zipcode_sort_order[i]].set_sort_value(seed.zipcode.get_distance_index_address(0)); sort_values_by_seed[zipcode_sort_order[i]].set_code_type(seed.zipcode.get_code_type(0)); } else if (interval.code_type == ZipCode::NODE || interval.code_type == ZipCode::ROOT_NODE || seed.zipcode.max_depth() == interval.depth) { @@ -2257,7 +2249,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cons #endif sort_values_by_seed[zipcode_sort_order[i]].set_sort_value( is_rev(seed.pos) ? seed.zipcode.get_length(interval.depth) - offset(seed.pos) - : offset(seed.pos)); + : offset(seed.pos)); sort_values_by_seed[zipcode_sort_order[i]].set_code_type(ZipCode::NODE); } else if (interval.code_type == ZipCode::CHAIN || interval.code_type == ZipCode::ROOT_CHAIN) { #ifdef DEBUG_ZIP_CODE_SORTING From 788506b8521e7fcb68db9c00a2acf7a5fdfc9369 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Fri, 15 May 2026 17:52:29 -0700 Subject: [PATCH 5/9] try out this helper function --- src/zip_code.cpp | 75 +++++++++++++++++++++++++++++++++++++++++++ src/zip_code.hpp | 5 +++ src/zip_code_tree.cpp | 30 +++++++++-------- 3 files changed, 97 insertions(+), 13 deletions(-) diff --git a/src/zip_code.cpp b/src/zip_code.cpp index 4699a24494..d06f31255d 100644 --- a/src/zip_code.cpp +++ b/src/zip_code.cpp @@ -640,6 +640,81 @@ size_t ZipCode::get_chain_component(const size_t& depth) const { } } +tuple ZipCode::get_chain_child_sort_info(const size_t& depth) const { +#ifdef DEBUG_ZIPCODE + // Check our assumptions + assert(NODE_OFFSET_OFFSET == 0); + assert(NODE_LENGTH_OFFSET == 1); + assert(NODE_IS_REVERSED_OFFSET == 2); + assert(NODE_CHAIN_COMPONENT_OFFSET == 3); + + assert(SNARL_IS_REGULAR_OFFSET == 0); + assert(SNARL_OFFSET_IN_CHAIN_OFFSET == 1); + assert(SNARL_LENGTH_OFFSET == 2); + assert(SNARL_CHAIN_COMPONENT_OFFSET == 4); +#endif + if (depth == 0) { + throw std::runtime_error("Depth 0 is not a child. Do your zipcode, minimizer, and graph files match?"); + } + + ZipCode::code_type_t code_type; + size_t offset_in_chain; + size_t length; + size_t chain_component; + bool is_reversed; + + if (decoder[depth].is_chain) { + // is_chain so could be a chain or a node + if (decoder[depth-1].is_chain) { + // If the thing before this was also a chain, then it is a node + code_type = ZipCode::NODE; + size_t zip_value; + size_t zip_index = decoder[depth].offset; + // First is the offset + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + offset_in_chain = zip_value == 0 ? std::numeric_limits::max() : zip_value-1; + // Next is the length + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + length = 0 ? std::numeric_limits::max() : zip_value-1; + // Next is the reversedness + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + is_reversed = zip_value; + // Finally is the chain component + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + chain_component = zip_value; + } else { + throw std::runtime_error("A chain is not a child of a chain. Do your zipcode, minimizer, and graph files match?"); + } + } else { + // Definitely a snarl + size_t zip_value; + size_t zip_index = decoder[depth].offset; + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + // First is the snarl type + if (zip_value == 0) { + code_type = ZipCode::IRREGULAR_SNARL; + } else if (zip_value == 1) { + code_type = ZipCode::REGULAR_SNARL; + } else { + code_type = ZipCode::CYCLIC_SNARL; + } + // Next is the offset + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + offset_in_chain = zip_value == 0 ? std::numeric_limits::max() : zip_value-1; + // Next is the length + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + length = 0 ? std::numeric_limits::max() : zip_value-1; + // Two from now is the chain component + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); + chain_component = zip_value; + // Snarls are always considered to be forward + is_reversed = false; + } + + return std::make_tuple(code_type, offset_in_chain, length, chain_component, is_reversed); +} + size_t ZipCode::get_last_chain_component(const size_t& depth, bool get_end) const { if (!decoder[depth].is_chain) { diff --git a/src/zip_code.hpp b/src/zip_code.hpp index d3ed284190..3b7d59269c 100644 --- a/src/zip_code.hpp +++ b/src/zip_code.hpp @@ -317,6 +317,11 @@ class ZipCode { ///For snarls, this will be the component of the start node size_t get_chain_component(const size_t& depth) const ; + /// Get all info needed to sort a child of a chain + /// This avoids paging through the same information multiple times + /// Order is type, offset, length, chain component, and reversedness + tuple get_chain_child_sort_info(const size_t& depth) const; + ///Get the chain component of the last node in the chain /// This behaves like the distance index get_chain_component- /// for looping chains it returns the last component if get_end is true, diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 4b14f78019..1140f9cd9f 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -2262,36 +2262,40 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cons // with offset in node of 0 (node 3 if chain is traversed forward) // See sort_value_t for more details - size_t prefix_sum = seed.zipcode.get_offset_in_chain(interval.depth+1); - - ZipCode::code_type_t child_type = seed.zipcode.get_code_type(interval.depth+1); - sort_values_by_seed[zipcode_sort_order[i]].set_code_type(child_type); - size_t chain_component = seed.zipcode.get_chain_component(interval.depth+1); + ZipCode::code_type_t code_type; + size_t offset_in_chain; + size_t length; + size_t chain_component; + bool is_reversed; + std::tie(code_type, offset_in_chain, length, chain_component, is_reversed) \ + = seed.zipcode.get_chain_child_sort_info(interval.depth+1); + + sort_values_by_seed[zipcode_sort_order[i]].set_code_type(code_type); sort_values_by_seed[zipcode_sort_order[i]].set_chain_component(chain_component); min_component = std::min(min_component, chain_component); max_component = std::max(max_component, chain_component); - if (child_type == ZipCode::REGULAR_SNARL - || child_type == ZipCode::IRREGULAR_SNARL - || child_type == ZipCode::CYCLIC_SNARL) { + if (code_type == ZipCode::REGULAR_SNARL + || code_type == ZipCode::IRREGULAR_SNARL + || code_type == ZipCode::CYCLIC_SNARL) { // For a snarl, the order is prefix_sum*3+1 - sort_values_by_seed[zipcode_sort_order[i]].set_sort_value(prefix_sum); + sort_values_by_seed[zipcode_sort_order[i]].set_sort_value(offset_in_chain); sort_values_by_seed[zipcode_sort_order[i]].set_chain_order(1); } else { // Order depends on where the position falls in the node - bool node_is_rev = seed.zipcode.get_is_reversed_in_parent(interval.depth+1) != is_rev(seed.pos); - size_t node_offset = node_is_rev ? seed.zipcode.get_length(interval.depth+1) - offset(seed.pos) + bool node_is_rev = is_reversed != is_rev(seed.pos); + size_t node_offset = node_is_rev ? length - offset(seed.pos) : offset(seed.pos); sort_values_by_seed[zipcode_sort_order[i]].set_sort_value( - SnarlDistanceIndex::sum(prefix_sum, node_offset)); + SnarlDistanceIndex::sum(offset_in_chain, node_offset)); sort_values_by_seed[zipcode_sort_order[i]].set_chain_order(node_offset == 0 ? 2 : 0); } #ifdef DEBUG_ZIP_CODE_SORTING cerr << "Prefix sum " << sort_values_by_seed[zipcode_sort_order[i]].get_distance_value() << " and sort value " << sort_values_by_seed[zipcode_sort_order[i]].get_sort_value() - << " and type " << child_type << endl; + << " and type " << code_type << endl; #endif } else { #ifdef DEBUG_ZIP_CODE_SORTING From 95dc0c3b40bfd2c779aaf8c611aa9432a038a9b7 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Fri, 15 May 2026 22:09:18 -0700 Subject: [PATCH 6/9] maybe use an iterator hehe --- src/zip_code_tree.cpp | 64 +++++++++++++++++++++---------------------- src/zip_code_tree.hpp | 22 +++++++-------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 1140f9cd9f..ce705cee3d 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -1524,7 +1524,7 @@ ZipCodeTree::distance_iterator::distance_iterator(size_t start_index, original_index(start_index), end_index(right_to_left ? 0 : (zip_code_tree.size() - 1)), zip_code_tree(&zip_code_tree), original_right_to_left(right_to_left), distance_limit(distance_limit), pos(start_index, right_to_left, std::stack(), chain_numbers, S_START), - current_item(zip_code_tree.at(start_index)) { + current_item(zip_code_tree.begin() + start_index) { if (done()) { // We are an end iterator. Nothing else to do. return; @@ -1560,7 +1560,7 @@ auto ZipCodeTree::distance_iterator::operator++() -> distance_iterator& { // a seed that has been ticked and yielded, or to end. if (!done()) { #ifdef debug_parse - std::cerr << "Skipping over a " << current_item.get_type() + std::cerr << "Skipping over a " << (*current_item).get_type() << " which we assume was handled already." << std::endl; #endif move_index(); @@ -1581,15 +1581,15 @@ auto ZipCodeTree::distance_iterator::operator*() const -> seed_result_t { // We are always at a seed, so show that seed #ifdef check_parse crash_unless(!done()); - crash_unless(current_item.get_type() == SEED); + crash_unless((*current_item).get_type() == SEED); crash_unless(!pos.stack_data.empty()); #endif // We know the running distance to this seed will be at the top of the stack // If we're at the exact same position, the distance must be 0 size_t distance = pos.stack_data.top(); - bool is_reversed = pos.right_to_left ? current_item.get_is_reversed() - : !current_item.get_is_reversed(); - return {distance, current_item.get_value(), is_reversed}; + bool is_reversed = pos.right_to_left ? (*current_item).get_is_reversed() + : !(*current_item).get_is_reversed(); + return {distance, (*current_item).get_value(), is_reversed}; } auto ZipCodeTree::distance_iterator::push(size_t value) -> void { @@ -1815,7 +1815,7 @@ void ZipCodeTree::distance_iterator::skip_chain() { void ZipCodeTree::distance_iterator::initialize_chain() { // Where *would* we jump, if we jumped? - push(pos.index + current_item.get_other_bound_offset()); + push(pos.index + (*current_item).get_other_bound_offset()); swap(); if (top() > distance_limit || top() == std::numeric_limits::max()) { #ifdef debug_parse @@ -1834,7 +1834,7 @@ void ZipCodeTree::distance_iterator::initialize_chain() { void ZipCodeTree::distance_iterator::save_opposite_cyclic_snarl_exit(size_t chain_num) { std::stack save_stack = pos.stack_data; std::stack save_chain_numbers = pos.chain_numbers; - size_t snarl_start_i = pos.index - current_item.get_value(); + size_t snarl_start_i = pos.index - (*current_item).get_value(); // Exit out the bound that we're NOT pointing at // (L->R exits at the start, R->L at the end) @@ -1869,7 +1869,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { << " going " << (pos.right_to_left ? "right to left" : "left to right") << " with running distance " << top() << std::endl; #endif - if (current_item.get_value() == std::numeric_limits::max()) { + if ((*current_item).get_value() == std::numeric_limits::max()) { // This is a root-level snarl, which has no distance matrix #ifdef debug_parse std::cerr << "Tried to initialize a root-level snarl; halting now." << std::endl; @@ -1880,7 +1880,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { } // Grab distances for this snarl - size_t snarl_start_i = pos.index - current_item.get_value(); + size_t snarl_start_i = pos.index - (*current_item).get_value(); bool is_cyclic = zip_code_tree->at(snarl_start_i).get_is_cyclic(); bool start_right_to_left = pos.right_to_left; @@ -1932,7 +1932,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { // Jump to CHAIN_COUNT change_index_to(snarl_start_i + 1); // Distance matrix has chains & bounds - size_t node_count = current_item.get_value() + 1; + size_t node_count = (*current_item).get_value() + 1; // Skip past distance matrix with N chains and thus N+1 nodes // A cyclic snarl's matrix has 2 rows per chain plus 2 for bounds, // and it also stores self-loop distances (i.e. main diagonal) @@ -1950,7 +1950,7 @@ void ZipCodeTree::distance_iterator::initialize_snarl(size_t chain_num) { void ZipCodeTree::distance_iterator::continue_snarl() { // Different scanning states based on snarl type - if (current_item.get_is_cyclic()) { + if ((*current_item).get_is_cyclic()) { #ifdef debug_parse std::cerr << "Continuing cyclic snarl" << std::endl; #endif @@ -1965,7 +1965,7 @@ void ZipCodeTree::distance_iterator::continue_snarl() { void ZipCodeTree::distance_iterator::use_saved_traversal() { pos = std::move(pending_traversals.top()); - current_item = zip_code_tree->at(pos.index); + current_item = zip_code_tree->begin() + pos.index; pending_traversals.pop(); // Swap end index to match new direction end_index = pos.right_to_left ? 0 : (zip_code_tree->size() - 1); @@ -1984,13 +1984,13 @@ auto ZipCodeTree::distance_iterator::halt() -> void { } void ZipCodeTree::distance_iterator::unimplemented_error() { - throw std::domain_error("Unimplemented symbol " + std::to_string(current_item.get_type()) + throw std::domain_error("Unimplemented symbol " + std::to_string((*current_item).get_type()) + " for state " + std::to_string(pos.state)); } auto ZipCodeTree::distance_iterator::tick() -> bool { #ifdef debug_parse - std::cerr << "Tick for state " << pos.state << " on symbol " << current_item.get_type() + std::cerr << "Tick for state " << pos.state << " on symbol " << (*current_item).get_type() << " at entry " << pos.index << std::endl; std::cerr << "Stack: "; std::stack temp_stack = pos.stack_data; @@ -2007,9 +2007,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Initial state. // // Stack is empty and we must be at a seed to start at. - if (current_item.get_type() == SEED) { + if ((*current_item).get_type() == SEED) { #ifdef debug_parse - std::cerr << "Skip over seed " << current_item.get_value() << std::endl; + std::cerr << "Skip over seed " << (*current_item).get_value() << std::endl; #endif push(0); pos.state = S_SCAN_CHAIN; @@ -2024,10 +2024,10 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // that running distances to use at the other chains in the snarl, and // under that running distances to use for the other chains in the // snarl's parent snarl, etc. - if (current_item.get_type() == SEED) { + if ((*current_item).get_type() == SEED) { // Calculate which direction this seed would be traversed in - bool cur_is_rev = pos.right_to_left ? current_item.get_is_reversed() - : !current_item.get_is_reversed(); + bool cur_is_rev = pos.right_to_left ? (*current_item).get_is_reversed() + : !(*current_item).get_is_reversed(); bool origin_is_rev = original_right_to_left ? zip_code_tree->at(original_index).get_is_reversed() : !zip_code_tree->at(original_index).get_is_reversed(); if (top() > 0 && cur_is_rev == origin_is_rev) { @@ -2036,7 +2036,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { crash_unless(depth() > 0); #endif #ifdef debug_parse - std::cerr << "Yield seed " << current_item.get_value() << ", distance " << top() << std::endl; + std::cerr << "Yield seed " << (*current_item).get_value() << ", distance " << top() << std::endl; #endif return true; } else { @@ -2044,13 +2044,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or it is in the opposite read orientation, // thus it could not be used anyhow: we skip it #ifdef debug_parse - std::cerr << "Skip seed " << current_item.get_value() << std::endl; + std::cerr << "Skip seed " << (*current_item).get_value() << std::endl; #endif } } else if (entered_snarl()) { // Running distance along chain is on stack, // and will need to be added to all the stored distances. - initialize_snarl(current_item.get_type() == SNARL_START ? 0 + initialize_snarl((*current_item).get_type() == SNARL_START ? 0 : std::numeric_limits::max()); } else if (exited_chain()) { if (depth() == 1) { @@ -2087,9 +2087,9 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // or running distance to cross the snarl, will be under it. continue_snarl(); } - } else if (current_item.get_type() == EDGE) { + } else if ((*current_item).get_type() == EDGE) { // Add value to running distance - top() = SnarlDistanceIndex::sum(top(), current_item.get_value()); + top() = SnarlDistanceIndex::sum(top(), (*current_item).get_value()); if (top() > distance_limit || top() == std::numeric_limits::max()) { // Skip over the rest of this chain if (depth() == 1) { @@ -2097,7 +2097,7 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // So if the distance along the chain is too much, there // are not going to be any results with a smaller distance. #ifdef debug_parse - std::cerr << "Halt because adding " << current_item.get_value() << " bp " + std::cerr << "Halt because adding " << (*current_item).get_value() << " bp " << "gives distance " << top() << " > " << distance_limit << std::endl; #endif halt(); @@ -2117,13 +2117,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // // Stack has at the top running distances to use for each chain still // to be visited, and under those the same for the snarl above, etc. - if (current_item.get_type() == EDGE) { + if ((*current_item).get_type() == EDGE) { // Hit the distance matrix; finished snarl // Top of the stack should be the parent running distance // Jump to the start of the snarl // Use snarl start offset from previous chain shift_index_by(1); - shift_index_by(-current_item.get_value()); + shift_index_by(-(*current_item).get_value()); // Resume scanning chain pos.state = S_SCAN_CHAIN; } else if (exited_snarl()) { @@ -2144,13 +2144,13 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // Cyclic snarls are traversed by "bouncing": entering each chain // from the left side, starting with the leftmost, and then once we // hit the other end of the snarl, entering each chain from the right. - if (current_item.get_type() == SNARL_END) { + if ((*current_item).get_type() == SNARL_END) { // Finished left sides, now doing right sides. #ifdef check_parse crash_unless(!pos.right_to_left); #endif pos.right_to_left = true; - } else if (current_item.get_type() == EDGE) { + } else if ((*current_item).get_type() == EDGE) { // Hit distance matrix again; finished snarl // Put original direction over distance to bound swap(); @@ -2161,12 +2161,12 @@ auto ZipCodeTree::distance_iterator::tick() -> bool { // The top of the stack will be the original direction we were going pos.right_to_left = (pop() == 1); shift_index_by(1); - size_t snarl_start_i = pos.index - current_item.get_value(); + size_t snarl_start_i = pos.index - (*current_item).get_value(); // Jump to snarl start change_index_to(snarl_start_i); if (!pos.right_to_left) { // Jump to snarl end - shift_index_by(current_item.get_other_bound_offset()); + shift_index_by((*current_item).get_other_bound_offset()); } #ifdef debug_parse std::cerr << "Jump to index " << pos.index << endl; diff --git a/src/zip_code_tree.hpp b/src/zip_code_tree.hpp index eba2d9243c..5b3b39151c 100644 --- a/src/zip_code_tree.hpp +++ b/src/zip_code_tree.hpp @@ -598,7 +598,7 @@ class ZipCodeTree { /// The iterator's current position (state, direction, stack, etc.) iteration_position pos; /// Cheap lookup for the item at that position - tree_item_t current_item; + vector::const_iterator current_item; /// Starting positions of other traversals that we will do later /// When the current traversal can no longer go on, /// we pop one of these and set the current position to it @@ -613,14 +613,14 @@ class ZipCodeTree { /// Shift the current index & item inline void shift_index_by(int shift) { + current_item += shift; pos.index += shift; - current_item = zip_code_tree->at(pos.index); } /// Teleport the current index inline void change_index_to(int new_index) { + current_item += new_index - pos.index; pos.index = new_index; - current_item = zip_code_tree->at(pos.index); } // Now we define a mini stack language so we can do a @@ -683,20 +683,20 @@ class ZipCodeTree { /// Check if the current symbol is an entrance/exit, /// based on the direction the iterator is going (right_to_left) inline bool entered_snarl() const { - return (pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_END) - || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_START); + return (pos.right_to_left && (*current_item).get_type() == ZipCodeTree::SNARL_END) + || (!pos.right_to_left && (*current_item).get_type() == ZipCodeTree::SNARL_START); } inline bool exited_snarl() const { - return (pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_START) - || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::SNARL_END); + return (pos.right_to_left && (*current_item).get_type() == ZipCodeTree::SNARL_START) + || (!pos.right_to_left && (*current_item).get_type() == ZipCodeTree::SNARL_END); } inline bool entered_chain() const { - return (pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_END) - || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_START); + return (pos.right_to_left && (*current_item).get_type() == ZipCodeTree::CHAIN_END) + || (!pos.right_to_left && (*current_item).get_type() == ZipCodeTree::CHAIN_START); } inline bool exited_chain() const { - return (pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_START) - || (!pos.right_to_left && current_item.get_type() == ZipCodeTree::CHAIN_END); + return (pos.right_to_left && (*current_item).get_type() == ZipCodeTree::CHAIN_START) + || (!pos.right_to_left && (*current_item).get_type() == ZipCodeTree::CHAIN_END); } /// Skip the current chain, jumping to the matching end From 4b7d7f6be2b9d43dd83110b059c91ada60e95dcd Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Sat, 16 May 2026 13:48:57 -0700 Subject: [PATCH 7/9] avoid over-reallocating --- src/algorithms/chain_items.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index ebc46da684..846a890e66 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -299,6 +299,9 @@ std::vector generate_zip_tree_transitions( const std::unordered_map& seed_to_ending) { std::vector all_transitions; + // Pre-guess that we'll need at least as many transitions + // as elements of ziptree (probably an undercount) + all_transitions.reserve(zip_code_tree.get_tree_size()); for (auto seed_itr = zip_code_tree.begin(); seed_itr != zip_code_tree.end(); ++seed_itr) { // For each destination seed left to right @@ -501,6 +504,7 @@ std::vector calculate_transition_read_distances( size_t max_read_lookback_bases) { std::vector filtered_transitions; + filtered_transitions.reserve(all_transitions.size()); for (auto transition : all_transitions) { // Emit a transition between a source and destination anchor, or skip if actually unreachable. From 74e51996a73ccff73b15a40b20eaeb5b2f39bf8f Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Mon, 18 May 2026 12:22:28 -0700 Subject: [PATCH 8/9] save less stuff --- src/algorithms/chain_items.cpp | 273 ++++++++------------------------- src/algorithms/chain_items.hpp | 38 ++--- 2 files changed, 73 insertions(+), 238 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index 846a890e66..c7aaf1e4b1 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -13,7 +13,6 @@ //#define debug_chaining //#define debug_transition -//#define debug_missing_transition //#define debug_dp namespace vg { @@ -261,31 +260,17 @@ transition_iterator zip_tree_transition_iterator(const std::vector all_transitions = generate_zip_tree_transitions(seeds, zip_code_tree, max_graph_lookback_bases, + max_read_lookback_bases, to_chain, seed_to_starting, seed_to_ending); -#ifdef debug_missing_transition - bool has_missing = \ - find_missing_zip_tree_transitions(seeds, zip_code_tree, max_graph_lookback_bases, - seed_to_starting, seed_to_ending, distance_index, - all_transitions); - if (has_missing) { - throw std::runtime_error("Zipcode tree iterator failed to output some transitions"); - } else { - cerr << "No missing transitions" << endl; - } -#endif - - std::vector filtered_transitions = - calculate_transition_read_distances(all_transitions, to_chain, max_read_lookback_bases); - // Sort the transitions so we handle them in an allowed order for dynamic programming. - std::sort(filtered_transitions.begin(), filtered_transitions.end(), + std::sort(all_transitions.begin(), all_transitions.end(), [&](const transition_info& a, const transition_info& b) { // Return true if a's destination seed is before b's in the read, and false otherwise. return to_chain[a.to_anchor].read_start() < to_chain[b.to_anchor].read_start(); }); - for (auto& transition : filtered_transitions) { + for (auto& transition : all_transitions) { callback(transition); } }; @@ -295,12 +280,13 @@ std::vector generate_zip_tree_transitions( const std::vector& seeds, const ZipCodeTree& zip_code_tree, size_t max_graph_lookback_bases, + size_t max_read_lookback_bases, + const VectorView& to_chain, const std::unordered_map& seed_to_starting, const std::unordered_map& seed_to_ending) { std::vector all_transitions; - // Pre-guess that we'll need at least as many transitions - // as elements of ziptree (probably an undercount) + // Save hopefully enough space for the transitions all_transitions.reserve(zip_code_tree.get_tree_size()); for (auto seed_itr = zip_code_tree.begin(); seed_itr != zip_code_tree.end(); ++seed_itr) { @@ -361,8 +347,9 @@ std::vector generate_zip_tree_transitions( std::cerr << "\t\tFound transition from #" << found_source_anchor->second << " to #" << cur_dest_anchor.second << std::endl; #endif - all_transitions.emplace_back(found_source_anchor->second, cur_dest_anchor.second, - source_seed.distance); + add_transition_if_legal(all_transitions, to_chain, max_read_lookback_bases, + found_source_anchor->second, cur_dest_anchor.second, + source_seed.distance); } else { #ifdef debug_transition std::cerr << " does not represent an anchor." << std::endl; @@ -382,8 +369,9 @@ std::vector generate_zip_tree_transitions( std::cerr << "\t\tFound backward transition from #" << cur_dest_anchor.second << " to #" << found_source_anchor->second << std::endl; #endif - all_transitions.emplace_back(cur_dest_anchor.second, found_source_anchor->second, - source_seed.distance); + add_transition_if_legal(all_transitions, to_chain, max_read_lookback_bases, + cur_dest_anchor.second, found_source_anchor->second, + source_seed.distance); } else { #ifdef debug_transition std::cerr << " does not represent an anchor." << std::endl; @@ -401,212 +389,79 @@ std::vector generate_zip_tree_transitions( return all_transitions; } -bool find_missing_zip_tree_transitions( - const std::vector& seeds, - const ZipCodeTree& zip_code_tree, - size_t max_graph_lookback_bases, - const std::unordered_map& seed_to_starting, - const std::unordered_map& seed_to_ending, - const SnarlDistanceIndex& distance_index, - const std::vector& all_transitions) { - - // {source anchor : {dest anchor : dist}} - std::unordered_map> found; - for (const auto& transition : all_transitions) { - size_t dist_to_save = transition.graph_distance; - if (!found.count(transition.from_anchor)) { - found[transition.from_anchor] = std::unordered_map(); - } - if (found[transition.from_anchor].count(transition.to_anchor)) { - // If a transition appears multiple times, remember the min - dist_to_save = std::min(transition.graph_distance, - found[transition.from_anchor][transition.to_anchor]); - } - found[transition.from_anchor][transition.to_anchor] = transition.graph_distance; - } - - bool has_missing = false; - - // Helper function to check for a distance between two seeds - auto check_distance = [&] (const ZipCodeTree::oriented_seed_t& from_seed, bool rev_from, - const ZipCodeTree::oriented_seed_t& to_seed, bool rev_to) { - // XOR to get appropriate orientations - rev_from ^= from_seed.is_reversed; - rev_to ^= to_seed.is_reversed; - if (rev_from != rev_to) { - // Cannot be compared; incompatible orientations - return; - } - - // Look up appropriate anchors - auto from_anchor_itr = rev_from ? seed_to_starting.find(from_seed.seed) - : seed_to_ending.find(from_seed.seed); - if ((rev_from && from_anchor_itr == seed_to_starting.end()) - || (!rev_from && from_anchor_itr == seed_to_ending.end())) { - // No anchor exists - return; - } - auto to_anchor_itr = rev_to ? seed_to_ending.find(to_seed.seed) - : seed_to_starting.find(to_seed.seed); - if ((rev_to && to_anchor_itr == seed_to_ending.end()) - || (!rev_to && to_anchor_itr == seed_to_starting.end())) { - // No anchor exists - return; - } - - // Construct seed positions - pos_t from_pos = seeds.at(from_seed.seed).pos; - size_t from_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(from_pos))); - from_pos = rev_from ? reverse(from_pos, from_length) - : from_pos; - pos_t to_pos = seeds.at(to_seed.seed).pos; - size_t to_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(to_pos))); - to_pos = rev_to ? reverse(to_pos, to_length) - : to_pos; - - // Look up true minimum distance - size_t true_distance = minimum_nontrivial_distance(distance_index, from_pos, to_pos); - if (true_distance <= max_graph_lookback_bases) { - // We should've found this transition - auto from_anchor = from_anchor_itr->second; - auto to_anchor = to_anchor_itr->second; - if (!found.count(from_anchor) - || !found[from_anchor].count(to_anchor) - || found[from_anchor][to_anchor] != true_distance) { - has_missing = true; - cerr << "Missing transition " << from_pos << "->" - << to_pos << " dist " << true_distance << endl; - } - } - }; - - vector tree_seeds = zip_code_tree.get_all_seeds(); - for (size_t i = 0; i < tree_seeds.size(); i++) { - // Check self-loops - check_distance(tree_seeds[i], false, tree_seeds[i], false); - check_distance(tree_seeds[i], false, tree_seeds[i], true); - check_distance(tree_seeds[i], true, tree_seeds[i], false); - for (size_t j = i + 1; j < tree_seeds.size(); j++) { - // Check all orientation pairs - check_distance(tree_seeds[i], false, tree_seeds[j], false); - check_distance(tree_seeds[i], false, tree_seeds[j], true); - check_distance(tree_seeds[i], true, tree_seeds[j], false); - check_distance(tree_seeds[i], true, tree_seeds[j], true); - } - } - - return has_missing; -} - -std::vector calculate_transition_read_distances( - const std::vector& all_transitions, - const VectorView& to_chain, - size_t max_read_lookback_bases) { - - std::vector filtered_transitions; - filtered_transitions.reserve(all_transitions.size()); - - for (auto transition : all_transitions) { - // Emit a transition between a source and destination anchor, or skip if actually unreachable. - auto& source_anchor = to_chain[transition.from_anchor]; - auto& dest_anchor = to_chain[transition.to_anchor]; +void add_transition_if_legal(vector& transitions, + const VectorView& to_chain, size_t max_read_lookback_bases, + size_t from_anchor, size_t to_anchor, size_t graph_distance) { + auto& source_anchor = to_chain[from_anchor]; + auto& dest_anchor = to_chain[to_anchor]; #ifdef debug_transition - std::cerr << "Handle transition #" << transition.from_anchor << " " << source_anchor - << " to #" << transition.to_anchor << " " << dest_anchor << std::endl; + std::cerr << "Handle transition #" << from_anchor << " " << source_anchor + << " to #" << to_anchor << " " << dest_anchor << std::endl; + assert(graph_distance != std::numeric_limits::max()); #endif - if (transition.graph_distance == std::numeric_limits::max()) { - // Not reachable in graph (somehow) - // TODO: Should never happen! + size_t read_distance = get_read_distance(source_anchor, dest_anchor); + if (read_distance == std::numeric_limits::max()) { + // Not reachable in read #ifdef debug_transition - std::cerr << "\tNot reachable in graph!" << std::endl; + std::cerr << "\tNot reachable in read." << std::endl; #endif - continue; - } - - size_t read_distance = get_read_distance(source_anchor, dest_anchor); - if (read_distance == std::numeric_limits::max()) { - // Not reachable in read -#ifdef debug_transition - std::cerr << "\tNot reachable in read." << std::endl; -#endif - continue; - } + return; + } - if (read_distance > max_read_lookback_bases) { - // Too far in read to consider + if (read_distance > max_read_lookback_bases) { + // Too far in read to consider #ifdef debug_transition - std::cerr << "\tToo far apart in read (" << read_distance - << "/" << max_read_lookback_bases << ")." << std::endl; + std::cerr << "\tToo far apart in read (" << read_distance + << "/" << max_read_lookback_bases << ")." << std::endl; #endif - continue; - } + return; + } - if (source_anchor.read_exclusion_end() > dest_anchor.read_exclusion_start()) { - // The actual core anchor part is reachable in the read, - // but we cut these down from overlapping minimizers. + if (source_anchor.read_exclusion_end() > dest_anchor.read_exclusion_start()) { + // The actual core anchor part is reachable in the read, + // but we cut these down from overlapping minimizers. #ifdef debug_transition - std::cerr << "\tOriginally overlapped in read." << std::endl; + std::cerr << "\tOriginally overlapped in read." << std::endl; #endif - continue; - } + return; + } - // The zipcode tree is about point positions, - // but we need distances between whole anchors. - // The stored zipcode positions will be at distances - // from the start/end of the associated anchor. - - // If the offset between the zip code point - // and the start of the destination is 0, - // and between the zip code point and the end of the source is 0, - // we subtract 0 from the measured distance. - // Otherwise we need to subtract something. - size_t distance_to_remove = dest_anchor.start_hint_offset() + source_anchor.end_hint_offset(); + // The zipcode tree is about point positions, + // but we need distances between whole anchors. + // The stored zipcode positions will be at distances + // from the start/end of the associated anchor. + + // If the offset between the zip code point + // and the start of the destination is 0, + // and between the zip code point and the end of the source is 0, + // we subtract 0 from the measured distance. + // Otherwise we need to subtract something. + size_t distance_to_remove = dest_anchor.start_hint_offset() + source_anchor.end_hint_offset(); #ifdef debug_transition - std::cerr << "\tZip code tree sees " << transition.graph_distance - << " but we should back out " << distance_to_remove << std::endl; + std::cerr << "\tZip code tree sees " << graph_distance + << " but we should back out " << distance_to_remove << std::endl; #endif - if (distance_to_remove > transition.graph_distance) { - // We actually end further along the graph path to the next - // thing than where the next thing starts, so we can't actually - // get there. - continue; - } - // Consume the length. - transition.graph_distance -= distance_to_remove; - + if (distance_to_remove > graph_distance) { + // We actually end further along the graph path to the next + // thing than where the next thing starts, so we can't actually + // get there. #ifdef debug_transition - std::cerr << "\tZip code tree sees " << source_anchor << " and " - << dest_anchor << " as " << transition.graph_distance << " apart" << std::endl; -#endif - -#ifdef double_check_distances - - auto from_pos = source_anchor.graph_end(); - auto to_pos = dest_anchor.graph_start(); - size_t check_distance = distance_index.minimum_distance( - id(from_pos), is_rev(from_pos), offset(from_pos), - id(to_pos), is_rev(to_pos), offset(to_pos), - false, &graph); - if (check_distance != transition.graph_distance) { - #pragma omp critical (cerr) - std::cerr << "\tZip code tree sees " << source_anchor << " and " - << dest_anchor << " as " << transition.graph_distance - << " apart but they are actually " << check_distance << " apart" << std::endl; - crash_unless(check_distance == transition.graph_distance); - } - + std::cerr << "\tBacked out too much" << std::endl; #endif - - // Send it along. - transition.read_distance = read_distance; - filtered_transitions.emplace_back(transition); + return; } + // Consume the length. + graph_distance -= distance_to_remove; - return filtered_transitions; +#ifdef debug_transition + std::cerr << "\tZip code tree sees " << source_anchor << " and " + << dest_anchor << " as " << graph_distance << " apart" << std::endl; +#endif + transitions.emplace_back(from_anchor, to_anchor, graph_distance, read_distance); } /// Compute a gap score like minimap2. diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index f4531d4201..de77c044f1 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -393,7 +393,7 @@ struct transition_info { size_t read_distance; // Constructor; read_distance defaults to max if not given - inline transition_info(size_t from, size_t to, size_t graph_dist, size_t read_dist = std::numeric_limits::max()) + inline transition_info(size_t from, size_t to, size_t graph_dist, size_t read_dist) : from_anchor(from), to_anchor(to), graph_distance(graph_dist), read_distance(read_dist) {} }; @@ -467,44 +467,24 @@ transition_iterator zip_tree_transition_iterator(const std::vector generate_zip_tree_transitions( const std::vector& seeds, const ZipCodeTree& zip_code_tree, size_t max_graph_lookback_bases, + size_t max_read_lookback_bases, + const VectorView& to_chain, const std::unordered_map& seed_to_starting, const std::unordered_map& seed_to_ending); - -/** - * Check if all possible transitions were actually found. - * - * Iterates over all pairs of seeds and uses the distance index - * to determine if there SHOULD have been a transition. - * - * Returns if any transitions were missing. - */ -bool find_missing_zip_tree_transitions( - const std::vector& seeds, - const ZipCodeTree& zip_code_tree, - size_t max_graph_lookback_bases, - const std::unordered_map& seed_to_starting, - const std::unordered_map& seed_to_ending, - const SnarlDistanceIndex& distance_index, - const std::vector& all_transitions); - /** - * Calculate read distances for each of the zip tree's transitions. - * Also filters out transitions that can't be used, - * e.g. not reachable in the read. + * Add a new transition_info to the end of "transitions" + * if said transition is legal (i.e. reachable in the read). * - * Used as a helper by zip_tree_transition_iterator(). + * Helper for generate_zip_tree_transitions() to avoid saving useless stuff. */ -std::vector calculate_transition_read_distances( - const std::vector& all_transitions, - const VectorView& to_chain, - size_t max_read_lookback_bases); +void add_transition_if_legal(vector& transitions, + const VectorView& to_chain, size_t max_read_lookback_bases, + size_t from_anchor, size_t to_anchor, size_t graph_distance); /** * Fill in the given DP table for the explored chain scores ending with each From a608e55da7cbfb460c51d3c0688ef9e8401e12b7 Mon Sep 17 00:00:00 2001 From: faithokamoto Date: Wed, 20 May 2026 12:29:25 -0700 Subject: [PATCH 9/9] move responsibility elsewhere for cleaner code --- src/zip_code.cpp | 75 ---------------------------------- src/zip_code.hpp | 5 --- src/zip_code_tree.cpp | 93 ++++++++++++++++++++++++++++--------------- src/zip_code_tree.hpp | 6 +++ 4 files changed, 68 insertions(+), 111 deletions(-) diff --git a/src/zip_code.cpp b/src/zip_code.cpp index d06f31255d..4699a24494 100644 --- a/src/zip_code.cpp +++ b/src/zip_code.cpp @@ -640,81 +640,6 @@ size_t ZipCode::get_chain_component(const size_t& depth) const { } } -tuple ZipCode::get_chain_child_sort_info(const size_t& depth) const { -#ifdef DEBUG_ZIPCODE - // Check our assumptions - assert(NODE_OFFSET_OFFSET == 0); - assert(NODE_LENGTH_OFFSET == 1); - assert(NODE_IS_REVERSED_OFFSET == 2); - assert(NODE_CHAIN_COMPONENT_OFFSET == 3); - - assert(SNARL_IS_REGULAR_OFFSET == 0); - assert(SNARL_OFFSET_IN_CHAIN_OFFSET == 1); - assert(SNARL_LENGTH_OFFSET == 2); - assert(SNARL_CHAIN_COMPONENT_OFFSET == 4); -#endif - if (depth == 0) { - throw std::runtime_error("Depth 0 is not a child. Do your zipcode, minimizer, and graph files match?"); - } - - ZipCode::code_type_t code_type; - size_t offset_in_chain; - size_t length; - size_t chain_component; - bool is_reversed; - - if (decoder[depth].is_chain) { - // is_chain so could be a chain or a node - if (decoder[depth-1].is_chain) { - // If the thing before this was also a chain, then it is a node - code_type = ZipCode::NODE; - size_t zip_value; - size_t zip_index = decoder[depth].offset; - // First is the offset - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - offset_in_chain = zip_value == 0 ? std::numeric_limits::max() : zip_value-1; - // Next is the length - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - length = 0 ? std::numeric_limits::max() : zip_value-1; - // Next is the reversedness - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - is_reversed = zip_value; - // Finally is the chain component - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - chain_component = zip_value; - } else { - throw std::runtime_error("A chain is not a child of a chain. Do your zipcode, minimizer, and graph files match?"); - } - } else { - // Definitely a snarl - size_t zip_value; - size_t zip_index = decoder[depth].offset; - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - // First is the snarl type - if (zip_value == 0) { - code_type = ZipCode::IRREGULAR_SNARL; - } else if (zip_value == 1) { - code_type = ZipCode::REGULAR_SNARL; - } else { - code_type = ZipCode::CYCLIC_SNARL; - } - // Next is the offset - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - offset_in_chain = zip_value == 0 ? std::numeric_limits::max() : zip_value-1; - // Next is the length - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - length = 0 ? std::numeric_limits::max() : zip_value-1; - // Two from now is the chain component - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - std::tie(zip_value, zip_index) = zipcode.get_value_and_next_index(zip_index); - chain_component = zip_value; - // Snarls are always considered to be forward - is_reversed = false; - } - - return std::make_tuple(code_type, offset_in_chain, length, chain_component, is_reversed); -} - size_t ZipCode::get_last_chain_component(const size_t& depth, bool get_end) const { if (!decoder[depth].is_chain) { diff --git a/src/zip_code.hpp b/src/zip_code.hpp index 3b7d59269c..d3ed284190 100644 --- a/src/zip_code.hpp +++ b/src/zip_code.hpp @@ -317,11 +317,6 @@ class ZipCode { ///For snarls, this will be the component of the start node size_t get_chain_component(const size_t& depth) const ; - /// Get all info needed to sort a child of a chain - /// This avoids paging through the same information multiple times - /// Order is type, offset, length, chain component, and reversedness - tuple get_chain_child_sort_info(const size_t& depth) const; - ///Get the chain component of the last node in the chain /// This behaves like the distance index get_chain_component- /// for looping chains it returns the last component if get_end is true, diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index ce705cee3d..b6e2cd7237 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -2262,40 +2262,13 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cons // with offset in node of 0 (node 3 if chain is traversed forward) // See sort_value_t for more details - ZipCode::code_type_t code_type; - size_t offset_in_chain; - size_t length; - size_t chain_component; - bool is_reversed; - std::tie(code_type, offset_in_chain, length, chain_component, is_reversed) \ - = seed.zipcode.get_chain_child_sort_info(interval.depth+1); - - sort_values_by_seed[zipcode_sort_order[i]].set_code_type(code_type); - sort_values_by_seed[zipcode_sort_order[i]].set_chain_component(chain_component); - min_component = std::min(min_component, chain_component); - max_component = std::max(max_component, chain_component); - - if (code_type == ZipCode::REGULAR_SNARL - || code_type == ZipCode::IRREGULAR_SNARL - || code_type == ZipCode::CYCLIC_SNARL) { - - // For a snarl, the order is prefix_sum*3+1 - sort_values_by_seed[zipcode_sort_order[i]].set_sort_value(offset_in_chain); - sort_values_by_seed[zipcode_sort_order[i]].set_chain_order(1); - } else { - // Order depends on where the position falls in the node - bool node_is_rev = is_reversed != is_rev(seed.pos); - size_t node_offset = node_is_rev ? length - offset(seed.pos) - : offset(seed.pos); - - sort_values_by_seed[zipcode_sort_order[i]].set_sort_value( - SnarlDistanceIndex::sum(offset_in_chain, node_offset)); - sort_values_by_seed[zipcode_sort_order[i]].set_chain_order(node_offset == 0 ? 2 : 0); - } + get_chain_child_sort_info(sort_values_by_seed[zipcode_sort_order[i]], seed, interval.depth+1); + min_component = std::min(min_component, sort_values_by_seed[zipcode_sort_order[i]].get_chain_component()); + max_component = std::max(max_component, sort_values_by_seed[zipcode_sort_order[i]].get_chain_component()); #ifdef DEBUG_ZIP_CODE_SORTING cerr << "Prefix sum " << sort_values_by_seed[zipcode_sort_order[i]].get_distance_value() << " and sort value " << sort_values_by_seed[zipcode_sort_order[i]].get_sort_value() - << " and type " << code_type << endl; + << " and type " << sort_values_by_seed[zipcode_sort_order[i]].get_code_type() << endl; #endif } else { #ifdef DEBUG_ZIP_CODE_SORTING @@ -2377,6 +2350,64 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cons return; } +void ZipCodeForest::get_chain_child_sort_info(sort_value_t& chain_child, const Seed& seed, const size_t& depth) const { + if (depth == 0) { + throw std::runtime_error("Depth 0 is not a child. Do your zipcode, minimizer, and graph files match?"); + } + + if (seed.zipcode.decoder[depth].is_chain) { + // is_chain so could be a chain or a node + if (seed.zipcode.decoder[depth-1].is_chain) { + // If the thing before this was also a chain, then it is a node + chain_child.set_code_type(ZipCode::NODE); + size_t zip_value; + size_t zip_index = seed.zipcode.decoder[depth].offset; + // First is the offset + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + size_t offset_in_chain = zip_value == 0 ? std::numeric_limits::max() : zip_value-1; + // Next is the length + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + size_t length = 0 ? std::numeric_limits::max() : zip_value-1; + // Next is the reversedness + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + bool is_reversed = zip_value; + // Finally is the chain component + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + chain_child.set_chain_component(zip_value); + + // Order depends on where the position falls in the node + size_t node_offset = is_reversed != is_rev(seed.pos) ? length - offset(seed.pos) + : offset(seed.pos); + chain_child.set_sort_value(SnarlDistanceIndex::sum(offset_in_chain, node_offset)); + chain_child.set_chain_order(node_offset == 0 ? 2 : 0); + } else { + throw std::runtime_error("A chain is not a child of a chain. Do your zipcode, minimizer, and graph files match?"); + } + } else { + // Definitely a snarl + chain_child.set_chain_order(1); + size_t zip_value; + size_t zip_index = seed.zipcode.decoder[depth].offset; + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + // First is the snarl type + if (zip_value == 0) { + chain_child.set_code_type(ZipCode::IRREGULAR_SNARL); + } else if (zip_value == 1) { + chain_child.set_code_type(ZipCode::REGULAR_SNARL); + } else { + chain_child.set_code_type(ZipCode::CYCLIC_SNARL); + } + // Next is the offset + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + chain_child.set_sort_value(zip_value == 0 ? std::numeric_limits::max() : zip_value-1); + // Three from now is the chain component + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + std::tie(zip_value, zip_index) = seed.zipcode.zipcode.get_value_and_next_index(zip_index); + chain_child.set_chain_component(zip_value); + } +} + void ZipCodeForest::get_next_intervals(forest_growing_state_t& forest_state, const interval_state_t& interval, std::forward_list& next_intervals) const { vector& zipcode_sort_order = forest_state.seed_sort_order; diff --git a/src/zip_code_tree.hpp b/src/zip_code_tree.hpp index 5b3b39151c..ea23483fcb 100644 --- a/src/zip_code_tree.hpp +++ b/src/zip_code_tree.hpp @@ -1173,6 +1173,12 @@ class ZipCodeForest { void sort_one_interval(forest_growing_state_t& forest_state, const interval_state_t& interval) const; + /// Helper for sort_one_interval() to set up sort_value_t for a chain child + /// Takes the seed and what depth to look at + /// This avoids paging through the same information multiple times + /// node_offset is only needed for nodes + void get_chain_child_sort_info(sort_value_t& chain_child, const Seed& seed, const size_t& depth) const; + /// Helper function for sort_one_interval() to sort seeds using radix sort /// Sorts the slice of seeds in the given interval of zipcode_sort_order, /// a vector of indices into seeds. Reverse order if reverse_order.