diff --git a/.gitignore b/.gitignore index 7cb3dfee5fc..53e1deec36a 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ debug_main.cpp /*.fa /*.gfa .vscode/* +.cache/* \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 1bb80d2346b..f739a448ee6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -133,3 +133,6 @@ [submodule "deps/mimalloc"] path = deps/mimalloc url = https://github.com/microsoft/mimalloc.git +[submodule "deps/GFAz"] + path = deps/GFAz + url = https://github.com/babyplutokurt/GFAz.git diff --git a/Makefile b/Makefile index 6506208d58f..cf37a644dbb 100644 --- a/Makefile +++ b/Makefile @@ -67,14 +67,14 @@ DEPGEN_FLAGS := -MMD -MP # even though that's not *always* safe. See # and # -INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -I. -I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_SRC_DIR) -I$(CWD)/$(UNITTEST_SUPPORT_SRC_DIR) -I$(CWD)/$(SUBCOMMAND_SRC_DIR) -I$(CWD)/$(INC_DIR)/dynamic $(shell $(PKG_CONFIG) --cflags $(PKG_CONFIG_DEPS) $(PKG_CONFIG_STATIC_DEPS) | tr ' ' '\n' | awk '!x[$$0]++' | tr '\n' ' ') +INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -I$(CWD)/$(INC_DIR)/GFAz -I. -I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_SRC_DIR) -I$(CWD)/$(UNITTEST_SUPPORT_SRC_DIR) -I$(CWD)/$(SUBCOMMAND_SRC_DIR) -I$(CWD)/$(INC_DIR)/dynamic $(shell $(PKG_CONFIG) --cflags $(PKG_CONFIG_DEPS) $(PKG_CONFIG_STATIC_DEPS) | tr ' ' '\n' | awk '!x[$$0]++' | tr '\n' ' ') # Define libraries to link vg against. # These need to come before library search paths from LDFLAGS or we won't # prefer linking vg-installed dependencies over system ones. LD_LIB_DIR_FLAGS := -L$(CWD)/$(LIB_DIR) -LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph -lcrypto +LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lhandlegraph -lgfa_compression_core -lzstd -lcrypto # We omit Boost Program Options for now; we find it in a platform-dependent way. # By default it has no suffix BOOST_SUFFIX="" @@ -372,6 +372,7 @@ IPS4O_DIR=deps/ips4o BBHASH_DIR=deps/BBHash MIO_DIR=deps/mio ATOMIC_QUEUE_DIR=deps/atomic_queue +GFAz_DIR=deps/GFAz # Dependencies that go into libvg's archive # These go in libvg but come from dependencies @@ -408,6 +409,7 @@ LIB_DEPS += $(LIB_DIR)/libvgio.a LIB_DEPS += $(LIB_DIR)/libhandlegraph.a LIB_DEPS += $(LIB_DIR)/libbdsg.a LIB_DEPS += $(LIB_DIR)/libxg.a +LIB_DEPS += $(LIB_DIR)/libgfa_compression_core.a ifneq ($(shell uname -s),Darwin) # On non-Mac (i.e. Linux), where ELF binaries are used, pull in libdw which # backward-cpp will use. @@ -519,6 +521,7 @@ DEPS += $(INC_DIR)/BooPHF.h DEPS += $(INC_DIR)/mio/mmap.hpp DEPS += $(INC_DIR)/atomic_queue.h + .PHONY: clean clean-tests get-deps deps lint test set-path objs static static-docker docs man .pre-build version # Aggregate all libvg deps, and exe deps other than libvg @@ -918,6 +921,16 @@ $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/* $(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/* +cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ + + +$(LIB_DIR)/libgfa_compression_core.a: $(GFAz_DIR)/CMakeLists.txt $(wildcard $(GFAz_DIR)/src/*) $(wildcard $(GFAz_DIR)/src/gpu/*) $(wildcard $(GFAz_DIR)/include/*) $(wildcard $(GFAz_DIR)/include/gpu/*) + +rm -f $(CWD)/$(LIB_DIR)/libgfa_compression_core.a + +rm -Rf $(CWD)/$(INC_DIR)/GFAz + +cd $(GFAz_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC $(CXXFLAGS) $(CPPFLAGS)" -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_CLI=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_TESTS=OFF -DGFAZ_USE_SYSTEM_ZSTD=ON .. && $(MAKE) $(FILTER) gfa_compression_core && cp libgfa_compression_core.a $(CWD)/$(LIB_DIR)/ + +mkdir -p $(CWD)/$(INC_DIR)/GFAz + +cp -r $(GFAz_DIR)/include/* $(CWD)/$(INC_DIR)/GFAz/ + + $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h +cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ diff --git a/deps/GFAz b/deps/GFAz new file mode 160000 index 00000000000..76fcddc55f0 --- /dev/null +++ b/deps/GFAz @@ -0,0 +1 @@ +Subproject commit 76fcddc55f0fe4c33d6a46d0e6b1c8b94208cc86 diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index a07fa88c9f0..11c4fe04561 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -1,11 +1,36 @@ #include "gfa_to_handle.hpp" #include "../path.hpp" +#include "../utility.hpp" #include namespace vg { namespace algorithms { +static std::string_view as_view(const GFAParser::chars_t& range) { + // chars_t is a pair of string::const_iterator into a contiguous buffer. + size_t len = range.second - range.first; + return len == 0 ? std::string_view() : std::string_view(&*range.first, len); +} + +static GFAParser::visit_iteratee_t make_p_visit_iteratee(const GFAParser::chars_t& visits) { + // Adapt the legacy P-line scan helpers to the parser's iterator interface. + return [visits](const GFAParser::visit_step_t& visit_step) { + GFAParser::scan_p_visits(visits, [&](int64_t rank, const GFAParser::chars_t& node_name, bool is_reverse) { + return visit_step(rank, 0, as_view(node_name), is_reverse); + }); + }; +} + +static GFAParser::visit_iteratee_t make_w_visit_iteratee(const GFAParser::chars_t& visits) { + // Adapt the legacy W-line scan helpers to the parser's iterator interface. + return [visits](const GFAParser::visit_step_t& visit_step) { + GFAParser::scan_w_visits(visits, [&](int64_t rank, const GFAParser::chars_t& node_name, bool is_reverse) { + return visit_step(rank, 0, as_view(node_name), is_reverse); + }); + }; +} + void GFAIDMapInfo::invert_translation() { if (!numeric_mode) { // Make the mapping @@ -46,6 +71,22 @@ static void write_gfa_translation(const GFAIDMapInfo& id_map_info, const string& } } +// Temporary filename-based parser dispatch for old command paths that have not +// yet moved fully onto the registry/VPKG loading path. +static unique_ptr make_gfa_family_parser_for_file(const string& filename) { + if (filename != "-" && GFAzParser::looks_like_gfaz(filename)) { + return make_unique(); + } + return make_unique(); +} + +static void attach_translation(GFAParser& parser, GFAIDMapInfo* translation) { + if (translation) { + // Use the given external translation so the caller can keep it around. + parser.external_id_map = translation; + } +} + /// Add listeners which let a GFA parser fill in a handle graph with nodes and edges. static void add_graph_listeners(GFAParser& parser, MutableHandleGraph* graph) { parser.node_listeners.push_back([&parser, graph](nid_t id, const GFAParser::chars_t& sequence, const GFAParser::tag_list_t& tags) { @@ -103,8 +144,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* }); parser.path_listeners.push_back([&parser, graph, reference_samples, ignore_sense](const string& name, - const GFAParser::chars_t& visits, - const GFAParser::chars_t& overlaps, + const GFAParser::visit_iteratee_t& visits, const GFAParser::tag_list_t& tags) { // For P lines, we add the path. @@ -168,29 +208,24 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* phase_block, subrange); - // Overlaps are pre-checked in scan_p - // TODO: Do it in a better place. - - GFAParser::scan_p_visits(visits, [&](int64_t step_rank, - const GFAParser::chars_t& step_name, - bool step_is_reverse) { + visits([&](int64_t step_rank, nid_t step_node_id, std::string_view step_name, bool step_is_reverse) { if (step_rank >= 0) { // Not an empty path sentinel. - // Find the node ID to visit. - nid_t n = GFAParser::find_existing_sequence_id(GFAParser::extract(step_name), parser.id_map()); - // And add the step. + // Use the pre-resolved ID if the parser supplied one; otherwise look it up by name. + nid_t n = step_node_id != 0 ? step_node_id + : GFAParser::find_existing_sequence_id(step_name, parser.id_map()); graph->append_step(path_handle, graph->get_handle(n, step_is_reverse)); } // Don't stop. return true; }); }); - + parser.walk_listeners.push_back([&parser, graph, reference_samples](const string& sample_name, int64_t haplotype, const string& contig_name, const subrange_t& subrange, - const GFAParser::chars_t& visits, + const GFAParser::visit_iteratee_t& visits, const GFAParser::tag_list_t& tags) { // For W lines, we add the path with a bit more metadata. @@ -260,23 +295,18 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* phase_block, assigned_subrange); - GFAParser::scan_w_visits(visits, [&](int64_t step_rank, - const GFAParser::chars_t& step_name, - bool step_is_reverse) { + visits([&](int64_t step_rank, nid_t step_node_id, std::string_view step_name, bool step_is_reverse) { if (step_rank >= 0) { // Not an empty path sentinel. - // Find the node ID to visit. - nid_t n = GFAParser::find_existing_sequence_id(GFAParser::extract(step_name), parser.id_map()); - // And add the step. + nid_t n = step_node_id != 0 ? step_node_id + : GFAParser::find_existing_sequence_id(step_name, parser.id_map()); graph->append_step(path_handle, graph->get_handle(n, step_is_reverse)); } // Don't stop. return true; }); }); - - - + parser.rgfa_listeners.push_back([&parser, graph, rgfa_cache](nid_t id, int64_t offset, size_t length, @@ -326,10 +356,10 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, GFAIDMapInfo* translation) { - - get_input_file(filename, [&](istream& in) { - gfa_to_handle_graph(in, graph, translation); - }); + GFATextParser parser; + attach_translation(parser, translation); + parser_to_handle_graph(parser, graph); + parser.parse(filename); } void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, @@ -344,24 +374,37 @@ void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, void gfa_to_handle_graph(istream& in, MutableHandleGraph* graph, GFAIDMapInfo* translation) { - GFAParser parser; - if (translation) { - // Use the given external translation so the caller can keep it around. - parser.external_id_map = translation; - } - add_graph_listeners(parser, graph); - + GFATextParser parser; + attach_translation(parser, translation); + parser_to_handle_graph(parser, graph); parser.parse(in); } +// Temporary IO wrappers for old command paths that still want filename-based +// "GFA or GFAZ" loading without going through the registry. +void load_gfa_or_gfaz_to_handle_graph(const string& filename, MutableHandleGraph* graph, + GFAIDMapInfo* translation) { + auto parser = make_gfa_family_parser_for_file(filename); + attach_translation(*parser, translation); + parser_to_handle_graph(*parser, graph); + parser->parse(filename); +} + +void load_gfa_or_gfaz_to_handle_graph(const string& filename, MutableHandleGraph* graph, + const string& translation_filename) { + + GFAIDMapInfo id_map_info; + load_gfa_or_gfaz_to_handle_graph(filename, graph, &id_map_info); + write_gfa_translation(id_map_info, translation_filename); +} void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, GFAIDMapInfo* translation, int64_t max_rgfa_rank, unordered_set* ignore_sense) { - - get_input_file(filename, [&](istream& in) { - gfa_to_path_handle_graph(in, graph, translation, max_rgfa_rank); - }); + GFATextParser parser; + attach_translation(parser, translation); + parser_to_path_handle_graph(parser, graph, max_rgfa_rank, ignore_sense); + parser.parse(filename); } void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, @@ -369,7 +412,7 @@ void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGr unordered_set* ignore_sense) { GFAIDMapInfo id_map_info; - gfa_to_path_handle_graph(filename, graph, &id_map_info, max_rgfa_rank); + gfa_to_path_handle_graph(filename, graph, &id_map_info, max_rgfa_rank, ignore_sense); write_gfa_translation(id_map_info, translation_filename); } @@ -380,19 +423,41 @@ void gfa_to_path_handle_graph(istream& in, int64_t max_rgfa_rank, unordered_set* ignore_sense) { - // TODO: Deduplicate this setup code with gfa_to_handle_graph more. - GFAParser parser; - if (translation) { - // Use the given external translation so the caller can keep it around. - parser.external_id_map = translation; - } + GFATextParser parser; + attach_translation(parser, translation); + parser_to_path_handle_graph(parser, graph, max_rgfa_rank, ignore_sense); + parser.parse(in); +} + +void load_gfa_or_gfaz_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, + GFAIDMapInfo* translation, int64_t max_rgfa_rank, + unordered_set* ignore_sense) { + auto parser = make_gfa_family_parser_for_file(filename); + attach_translation(*parser, translation); + parser_to_path_handle_graph(*parser, graph, max_rgfa_rank, ignore_sense); + parser->parse(filename); +} + +void load_gfa_or_gfaz_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, + int64_t max_rgfa_rank, const string& translation_filename, + unordered_set* ignore_sense) { + + GFAIDMapInfo id_map_info; + load_gfa_or_gfaz_to_path_handle_graph(filename, graph, &id_map_info, max_rgfa_rank, ignore_sense); + write_gfa_translation(id_map_info, translation_filename); +} + +void parser_to_handle_graph(GFAParser& parser, MutableHandleGraph* graph) { + add_graph_listeners(parser, graph); +} + +void parser_to_path_handle_graph(GFAParser& parser, + MutablePathMutableHandleGraph* graph, + int64_t max_rgfa_rank, + unordered_set* ignore_sense) { add_graph_listeners(parser, graph); - - // Set up for path input parser.max_rgfa_rank = max_rgfa_rank; add_path_listeners(parser, graph, ignore_sense); - - parser.parse(in); } /// Read a range, stopping before any end character in the given null-terminated string, @@ -843,8 +908,28 @@ nid_t GFAParser::assign_new_sequence_id(const string& str, GFAIDMapInfo& id_map_ return node_id; } -nid_t GFAParser::find_existing_sequence_id(const string& str, GFAIDMapInfo& id_map_info) { - auto found = id_map_info.name_to_id->find(str); +nid_t GFAParser::find_existing_sequence_id(std::string_view str, GFAIDMapInfo& id_map_info) { + if (id_map_info.numeric_mode && id_map_info.direct_numeric_lookup) { + // GFAz fast path: IDs are dense 1-based integers, no map needed. + if (str.empty()) { + return 0; + } + nid_t parsed_id = 0; + for (char c : str) { + if (!isdigit(static_cast(c))) { + return 0; + } + parsed_id *= 10; + parsed_id += (c - '0'); + } + if (parsed_id > 0 && parsed_id <= id_map_info.max_id) { + return parsed_id; + } + return 0; + } + // unordered_map::find does not support heterogeneous lookup + // before C++20, so we allocate only here on the slow path. + auto found = id_map_info.name_to_id->find(string(str)); if (found != id_map_info.name_to_id->end()) { // already in map, just return return found->second; @@ -853,7 +938,17 @@ nid_t GFAParser::find_existing_sequence_id(const string& str, GFAIDMapInfo& id_m return 0; } -void GFAParser::parse(istream& in) { +void GFAParser::parse(const string& filename) { + get_input_file(filename, [&](istream& in) { + parse(in); + }); +} + +void GFATextParser::parse(const string& filename) { + GFAParser::parse(filename); +} + +void GFATextParser::parse(istream& in) { if (!in) { throw std::ios_base::failure("error:[GFAParser] Couldn't open input stream"); } @@ -1120,7 +1215,7 @@ void GFAParser::parse(istream& in) { auto& visits = get<1>(p_parse); auto& overlaps = get<2>(p_parse); auto& tags = get<3>(p_parse); - + for(auto it = overlaps.first; it != overlaps.second; ++it) { if (*it != '*' && *it != ',' && *it != 'M' && (*it < '0' || *it > '9')) { // This overlap isn't just * or a list of * or a list of matches with numbers. @@ -1137,13 +1232,13 @@ void GFAParser::parse(istream& in) { // Nothing to do for empty paths return true; } - + if (step_rank >= 0) { - string step_string = GFAParser::extract(step_id); - nid_t n = GFAParser::find_existing_sequence_id(step_string, this->id_map()); + std::string_view step_view = as_view(step_id); + nid_t n = GFAParser::find_existing_sequence_id(step_view, this->id_map()); if (!n) { missing = true; - missing_name = std::move(step_string); + missing_name.assign(step_view); return false; } } @@ -1156,7 +1251,7 @@ void GFAParser::parse(istream& in) { for (auto& listener : this->path_listeners) { // Tell all the listener functions - listener(path_name, visits, overlaps, tags); + listener(path_name, make_p_visit_iteratee(visits), tags); } } break; @@ -1187,13 +1282,13 @@ void GFAParser::parse(istream& in) { // Nothing to do for empty paths return true; } - + // For every node the walk visits, make sure we have seen it. - string step_string = GFAParser::extract(step_id); - nid_t n = GFAParser::find_existing_sequence_id(step_string, this->id_map()); + std::string_view step_view = as_view(step_id); + nid_t n = GFAParser::find_existing_sequence_id(step_view, this->id_map()); if (!n) { missing = true; - missing_name = std::move(step_string); + missing_name.assign(step_view); return false; } return true; @@ -1204,9 +1299,10 @@ void GFAParser::parse(istream& in) { return false; } + auto visit_iteratee = make_w_visit_iteratee(visits); for (auto& listener : this->walk_listeners) { // Tell all the listener functions - listener(sample_name, haplotype, contig_name, subrange, visits, tags); + listener(sample_name, haplotype, contig_name, subrange, visit_iteratee, tags); } } break; diff --git a/src/algorithms/gfa_to_handle.hpp b/src/algorithms/gfa_to_handle.hpp index 0e5bda62cd7..27e8772cab3 100644 --- a/src/algorithms/gfa_to_handle.hpp +++ b/src/algorithms/gfa_to_handle.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "../handle.hpp" @@ -25,6 +26,10 @@ using namespace std; struct GFAIDMapInfo : public NamedNodeBackTranslation { /// If true, GFA string IDs are just graph numerical IDs. bool numeric_mode = true; + /// If true, existing node lookups may resolve numeric IDs directly + /// by range check (1..max_id) without requiring an entry in name_to_id. + /// Only safe when all IDs in [1, max_id] are guaranteed to exist (e.g. GFAz). + bool direct_numeric_lookup = false; /// This holds the max node ID yet used. nid_t max_id = 0; /// This maps from GFA string ID to graph numerical ID. @@ -54,11 +59,11 @@ struct GFAIDMapInfo : public NamedNodeBackTranslation { }; -/// Read a GFA file for a blunt-ended graph into a HandleGraph. Give "-" as a filename for stdin. +/// Read a text GFA file for a blunt-ended graph into a HandleGraph. +/// Give "-" as a filename for stdin. /// -/// Throws GFAFormatError if the GFA file is not acceptable, and -/// std::ios_base::failure if an IO operation fails. Throws invalid_argument if -/// otherwise misused. +/// Throws GFAFormatError if the file is not acceptable, and +/// std::ios_base::failure if an IO operation fails. /// Does not give max ID hints, and so might be very slow when loading into an ODGI graph. void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, @@ -74,7 +79,18 @@ void gfa_to_handle_graph(istream& in, MutableHandleGraph* graph, GFAIDMapInfo* translation = nullptr); -/// Same as gfa_to_handle_graph but also adds path elements from the GFA to the graph. +/// Temporary IO helper for old command paths that still need to load either a +/// text GFA or a GFAZ file by filename. +void load_gfa_or_gfaz_to_handle_graph(const string& filename, + MutableHandleGraph* graph, + GFAIDMapInfo* translation = nullptr); + +/// Overload which serializes its translation to a file internally. +void load_gfa_or_gfaz_to_handle_graph(const string& filename, + MutableHandleGraph* graph, + const string& translation_filename); + +/// Same as gfa_to_handle_graph but also adds path elements from the text GFA to the graph. void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, GFAIDMapInfo* translation = nullptr, @@ -95,6 +111,21 @@ void gfa_to_path_handle_graph(istream& in, int64_t max_rgfa_rank = numeric_limits::max(), unordered_set* ignore_sense = nullptr); +/// Temporary IO helper for old command paths that still need to load either a +/// text GFA or a GFAZ file by filename. +void load_gfa_or_gfaz_to_path_handle_graph(const string& filename, + MutablePathMutableHandleGraph* graph, + GFAIDMapInfo* translation = nullptr, + int64_t max_rgfa_rank = numeric_limits::max(), + unordered_set* ignore_sense = nullptr); + +/// Overload which serializes its translation to a file internally. +void load_gfa_or_gfaz_to_path_handle_graph(const string& filename, + MutablePathMutableHandleGraph* graph, + int64_t max_rgfa_rank, + const string& translation_filename, + unordered_set* ignore_sense = nullptr); + /** * Lower-level tools for parsing GFA elements. * @@ -133,6 +164,13 @@ class GFAParser { // And a type for a collection of GFA tags. // This could become a range or list of ranges if we wanted to copy less. using tag_list_t = vector; + // And a callback shape for iterating path/walk visits without exposing + // the source encoding. The node_name is a view into a buffer owned by the + // caller and is valid only for the duration of the call. If node_id is + // nonzero, it is the pre-resolved graph node ID and the listener should + // use it directly instead of looking up node_name; text parsers pass 0. + using visit_step_t = function; + using visit_iteratee_t = function; /** * Parse tags out from a possibly empty range to a vector of tag strings. @@ -174,7 +212,7 @@ class GFAParser { */ static void scan_p_visits(const chars_t& visit_range, function visit_step); - + /** * Scan visits extracted from a W line. * Calls a callback with all the steps. @@ -228,7 +266,7 @@ class GFAParser { /** * Find the existing sequence ID for the given node name, or 0 if it has not been seen yet. */ - static nid_t find_existing_sequence_id(const string& str, GFAIDMapInfo& id_map_info); + static nid_t find_existing_sequence_id(std::string_view str, GFAIDMapInfo& id_map_info); // To actually parse GFA, we stick event listeners on here and then we go // through the GFA. It is the parser's job to make sure events aren't fired @@ -241,7 +279,7 @@ class GFAParser { GFAIDMapInfo* external_id_map = nullptr; /// Get the ID map we should be using for parsing. - inline GFAIDMapInfo& id_map(); + GFAIDMapInfo& id_map(); /// These listeners are called for the header line(s), if any. vector> header_listeners; @@ -255,11 +293,11 @@ class GFAParser { /// These listeners will be called with information for all P line paths, /// after the listeners for all involved nodes, and for the first header if any. /// Listeners are not protected from duplicate path names. - vector> path_listeners; + vector> path_listeners; /// These listeners will be called with information for all W line paths, /// after the listeners for all involved nodes, and for the first header if any. /// Listeners are not protected from duplicate path metadata. - vector& subrange, const chars_t& visits, const tag_list_t& tags)>> walk_listeners; + vector& subrange, const visit_iteratee_t& visits, const tag_list_t& tags)>> walk_listeners; /// These listeners will be called with each visit of an rGFA path to a /// node, after the node listeners for the involved node, but in an /// unspecified order with respect to listeners for headers. They will be @@ -281,8 +319,51 @@ class GFAParser { /** * Parse GFA from the given stream. */ - void parse(istream& in); + virtual void parse(istream& in) = 0; + + /** + * Parse GFA from the given filename. + * The default implementation opens the file as a text GFA stream and calls + * the stream parser. + */ + virtual void parse(const string& filename); + virtual ~GFAParser() = default; + +}; + +/// Drive the shared node/edge import logic from an already-configured parser. +void parser_to_handle_graph(GFAParser& parser, + MutableHandleGraph* graph); + +/// Drive the shared node/edge/path import logic from an already-configured parser. +void parser_to_path_handle_graph(GFAParser& parser, + MutablePathMutableHandleGraph* graph, + int64_t max_rgfa_rank = numeric_limits::max(), + unordered_set* ignore_sense = nullptr); + +/// Text GFA parser implementation that emits listener events from the legacy +/// line-oriented parser. +class GFATextParser : public GFAParser { +public: + void parse(istream& in) override; + void parse(const string& filename) override; +}; + +/// GFAZ parser implementation that emits the same listener events from +/// decompressed GFAZ data. +class GFAzParser : public GFAParser { +public: + void parse(istream& in) override; + void parse(const string& filename) override; + + /// Return true if the named file begins with the GFAz magic number. + /// Returns false for stdin ("-") or empty filenames. + static bool looks_like_gfaz(const string& filename); + + /// Return true if the first sizeof(uint32_t) bytes of the buffer encode the + /// GFAz magic number. + static bool has_magic(const char* buffer, size_t length); }; /// This exception will be thrown if the GFA data is not acceptable. diff --git a/src/algorithms/gfaz_to_handle.cpp b/src/algorithms/gfaz_to_handle.cpp new file mode 100644 index 00000000000..1d4ebd232af --- /dev/null +++ b/src/algorithms/gfaz_to_handle.cpp @@ -0,0 +1,521 @@ +#include "gfa_to_handle.hpp" + +#include "../path.hpp" + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace vg { +namespace algorithms { +using gfaz::gfa_write_utils::SequenceOffsets; +using gfaz::gfa_write_utils::build_offsets; +using gfaz::gfa_write_utils::decode_rules; +using gfaz::gfa_write_utils::decompress_optional_column; +using gfaz::gfa_write_utils::decompress_string_column; +using gfaz::CompressedData; +using gfaz::LinkData; +using gfaz::NodeId; +using gfaz::OptionalFieldColumn; +using gfaz::GFAZ_MAGIC; +using gfaz::deserialize_compressed_data; +namespace Codec = gfaz::Codec; + +struct StreamingGFAZPaths { + string header_line; + size_t node_count = 0; + vector node_lengths; + vector segment_optional_fields; + + LinkData links; + + vector rules_first; + vector rules_second; + vector paths_flat; + vector walks_flat; + SequenceOffsets path_offsets; + SequenceOffsets walk_offsets; + SequenceOffsets original_path_offsets; + SequenceOffsets original_walk_offsets; + + vector path_names; + vector walk_sample_ids; + vector walk_hap_indices; + vector walk_seq_ids; + vector walk_seq_starts; + vector walk_seq_ends; + + uint32_t min_rule_id = 0; +}; + +bool GFAzParser::has_magic(const char* buffer, size_t length) { + if (length < sizeof(uint32_t)) { + return false; + } + uint32_t magic = 0; + memcpy(&magic, buffer, sizeof(uint32_t)); + return magic == GFAZ_MAGIC; +} + +bool GFAzParser::looks_like_gfaz(const string& filename) { + if (filename.empty() || filename == "-") { + return false; + } + ifstream in(filename, ios::binary); + if (!in) { + return false; + } + char buffer[sizeof(uint32_t)]; + in.read(buffer, sizeof(buffer)); + return has_magic(buffer, static_cast(in.gcount())); +} + +static CompressedData load_gfaz_compressed(const string &filename) { + if (filename == "-") { + throw invalid_argument("GFAZ input from stdin (-) is not supported"); + } + return deserialize_compressed_data(filename); +} + +static void expand_rule(uint32_t rule_id, bool reverse, + const vector &first, + const vector &second, uint32_t min_id, + uint32_t max_id, vector &out) { + const uint32_t idx = rule_id - min_id; + const int32_t a = first[idx]; + const int32_t b = second[idx]; + + if (!reverse) { + const uint32_t abs_a = static_cast(std::abs(a)); + if (abs_a >= min_id && abs_a < max_id) { + expand_rule(abs_a, a < 0, first, second, min_id, max_id, out); + } else { + out.push_back(a); + } + + const uint32_t abs_b = static_cast(std::abs(b)); + if (abs_b >= min_id && abs_b < max_id) { + expand_rule(abs_b, b < 0, first, second, min_id, max_id, out); + } else { + out.push_back(b); + } + } else { + const uint32_t abs_b = static_cast(std::abs(b)); + if (abs_b >= min_id && abs_b < max_id) { + expand_rule(abs_b, b >= 0, first, second, min_id, max_id, out); + } else { + out.push_back(-b); + } + + const uint32_t abs_a = static_cast(std::abs(a)); + if (abs_a >= min_id && abs_a < max_id) { + expand_rule(abs_a, a >= 0, first, second, min_id, max_id, out); + } else { + out.push_back(-a); + } + } +} + +static vector decode_sequence_at_index( + const vector &flat, const SequenceOffsets &compressed_offsets, + const SequenceOffsets &original_offsets, size_t index, + const vector &rules_first, const vector &rules_second, + uint32_t min_rule_id, int delta_round) { + if (index + 1 >= compressed_offsets.size()) { + throw out_of_range("GFAZ traversal index out of range"); + } + + const size_t start = compressed_offsets[index]; + const size_t end = compressed_offsets[index + 1]; + if (end > flat.size()) { + throw runtime_error("GFAZ traversal block is truncated"); + } + + const uint32_t max_rule_id = + min_rule_id + static_cast(rules_first.size()); + const size_t original_length = + (index + 1 < original_offsets.size()) + ? (original_offsets[index + 1] - original_offsets[index]) + : (end - start); + + vector decoded; + decoded.reserve(original_length); + for (size_t pos = start; pos < end; ++pos) { + const NodeId node = flat[pos]; + const uint32_t abs_id = static_cast(std::abs(node)); + if (abs_id >= min_rule_id && abs_id < max_rule_id) { + expand_rule(abs_id, node < 0, rules_first, rules_second, min_rule_id, + max_rule_id, decoded); + } else { + decoded.push_back(node); + } + } + + vector> seqs(1); + seqs[0] = std::move(decoded); + for (int round = 0; round < delta_round; ++round) { + Codec::inverse_delta_transform(seqs); + } + return std::move(seqs[0]); +} + +static void dispatch_rgfa_visits( + size_t node_count, const vector& segment_optional_fields, + const vector& node_lengths, int64_t max_rgfa_rank, + const function& callback); + + +static GFAParser::chars_t as_chars(const string& value) { + return make_pair(value.begin(), value.end()); +} + +static GFAParser::tag_list_t collect_optional_tags_for_row( + const vector& columns, + vector& string_offsets, + vector& byte_offsets, + size_t row_index) { + GFAParser::tag_list_t tags; + tags.reserve(columns.size()); + for (size_t i = 0; i < columns.size(); ++i) { + const auto& col = columns[i]; + switch (col.type) { + case 'A': + if (row_index < col.char_values.size()) { + tags.push_back(col.tag + ":A:" + string(1, col.char_values[row_index])); + } + break; + case 'i': + if (row_index < col.int_values.size()) { + tags.push_back(col.tag + ":i:" + to_string(col.int_values[row_index])); + } + break; + case 'f': + if (row_index < col.float_values.size()) { + tags.push_back(col.tag + ":f:" + to_string(col.float_values[row_index])); + } + break; + case 'Z': + case 'J': + case 'H': + if (row_index < col.string_lengths.size()) { + uint32_t len = col.string_lengths[row_index]; + string value = col.concatenated_strings.substr(string_offsets[i], len); + string_offsets[i] += len; + tags.push_back(col.tag + ":" + string(1, col.type) + ":" + value); + } + break; + case 'B': + if (row_index < col.b_lengths.size() && row_index < col.b_subtypes.size()) { + uint32_t len = col.b_lengths[row_index]; + size_t offset = byte_offsets[i]; + byte_offsets[i] += len; + string value = col.tag + ":B:" + string(1, col.b_subtypes[row_index]); + for (uint32_t j = 0; j < len && offset + j < col.b_concat_bytes.size(); ++j) { + value.push_back(','); + value += to_string(col.b_concat_bytes[offset + j]); + } + tags.push_back(std::move(value)); + } + break; + default: + break; + } + } + return tags; +} + +static GFAParser::visit_iteratee_t make_gfaz_visit_iteratee(const vector& visits) { + return [visits](const GFAParser::visit_step_t& visit_step) { + if (visits.empty()) { + visit_step(-1, 0, std::string_view(), false); + return; + } + // GFAz encodes node IDs numerically, so we hand them to the listener + // directly and skip the listener's name-based lookup. + for (size_t i = 0; i < visits.size(); ++i) { + NodeId visit = visits[i]; + bool is_reverse = visit < 0; + nid_t node_id = is_reverse ? -static_cast(visit) : static_cast(visit); + if (!visit_step(i, node_id, std::string_view(), is_reverse)) { + return; + } + } + }; +} + +template +static void handle_duplicate_paths(GFAParser& parser, char line_type, Callback&& callback) { + try { + callback(); + } catch (GFADuplicatePathError& e) { + if (parser.stop_on_duplicate_paths) { + throw; + } + #pragma omp critical (cerr) + cerr << "warning:[GFAzParser] Skipping GFA " << line_type << " line: " + << e.what() << endl; + } +} + +static void dispatch_rgfa_visits(size_t node_count, + const vector& segment_optional_fields, + const vector& node_lengths, + int64_t max_rgfa_rank, + const function& callback) { + if (max_rgfa_rank < 0) { + return; + } + + const OptionalFieldColumn* sn_col = nullptr; + const OptionalFieldColumn* so_col = nullptr; + const OptionalFieldColumn* sr_col = nullptr; + for (const auto& col : segment_optional_fields) { + if (col.tag == "SN" && col.type == 'Z') { + sn_col = &col; + } else if (col.tag == "SO" && col.type == 'i') { + so_col = &col; + } else if (col.tag == "SR" && col.type == 'i') { + sr_col = &col; + } + } + if (!sn_col || !so_col || !sr_col) { + return; + } + + vector sn_offsets(sn_col->string_lengths.size() + 1, 0); + for (size_t i = 0; i < sn_col->string_lengths.size(); ++i) { + sn_offsets[i + 1] = sn_offsets[i] + sn_col->string_lengths[i]; + } + + struct RGFAVisit { + int64_t offset; + nid_t node_id; + size_t length; + }; + struct RGFAPath { + int64_t rank = 0; + bool rank_set = false; + vector visits; + }; + unordered_map by_path; + const int64_t missing_i64 = numeric_limits::min(); + + for (nid_t node_id = 1; static_cast(node_id) <= node_count; ++node_id) { + size_t idx = node_id - 1; + if (idx >= sn_col->string_lengths.size() || + idx >= so_col->int_values.size() || + idx >= sr_col->int_values.size()) { + continue; + } + uint32_t sn_len = sn_col->string_lengths[idx]; + int64_t so = so_col->int_values[idx]; + int64_t sr = sr_col->int_values[idx]; + if (sn_len == 0 || so == missing_i64 || sr == missing_i64 || sr > max_rgfa_rank) { + continue; + } + + string path_name = sn_col->concatenated_strings.substr(sn_offsets[idx], sn_len); + auto& path_info = by_path[path_name]; + if (path_info.rank_set && path_info.rank != sr) { + throw GFAFormatError("rGFA path " + path_name + + " has conflicting ranks " + to_string(sr) + " and " + + to_string(path_info.rank)); + } + path_info.rank = sr; + path_info.rank_set = true; + size_t length = (static_cast(node_id) < node_lengths.size()) ? node_lengths[node_id] : 0; + path_info.visits.push_back({so, node_id, length}); + } + + for (auto& kv : by_path) { + auto& path_name = kv.first; + auto& path_info = kv.second; + sort(path_info.visits.begin(), path_info.visits.end(), [](const RGFAVisit& a, const RGFAVisit& b) { + return a.offset < b.offset; + }); + for (const auto& visit : path_info.visits) { + callback(visit.node_id, visit.offset, visit.length, path_name, path_info.rank); + } + } +} + +void GFAzParser::parse(istream& in) { + (void)in; + throw invalid_argument("GFAZ input from streams is not supported directly"); +} + +void GFAzParser::parse(const string& filename) { + CompressedData compressed = load_gfaz_compressed(filename); + StreamingGFAZPaths gfaz_paths; + + { + string segment_sequences = + Codec::zstd_decompress_string(compressed.segment_sequences_zstd); + vector segment_lengths = + Codec::zstd_decompress_uint32_vector(compressed.segment_seq_lengths_zstd); + gfaz_paths.header_line = compressed.header_line; + gfaz_paths.node_count = segment_lengths.size(); + gfaz_paths.node_lengths.resize(gfaz_paths.node_count + 1, 0); + + auto& map_info = this->id_map(); + map_info.numeric_mode = true; + map_info.direct_numeric_lookup = true; + map_info.max_id = gfaz_paths.node_count; + map_info.name_to_id->clear(); + map_info.id_to_name.reset(); + + gfaz_paths.segment_optional_fields.reserve(compressed.segment_optional_fields_zstd.size()); + for (const auto& column : compressed.segment_optional_fields_zstd) { + gfaz_paths.segment_optional_fields.push_back(decompress_optional_column(column)); + } + + if (!gfaz_paths.header_line.empty() && gfaz_paths.header_line[0] == 'H') { + auto parsed = GFAParser::parse_h(gfaz_paths.header_line); + for (auto& listener : this->header_listeners) { + listener(get<0>(parsed)); + } + } + + vector optional_string_offsets(gfaz_paths.segment_optional_fields.size(), 0); + vector optional_byte_offsets(gfaz_paths.segment_optional_fields.size(), 0); + size_t segment_seq_offset = 0; + for (size_t i = 0; i < segment_lengths.size(); ++i) { + nid_t id = static_cast(i + 1); + uint32_t len = segment_lengths[i]; + if (segment_seq_offset + len > segment_sequences.size()) { + throw runtime_error("GFAZ segment sequence column is truncated"); + } + string sequence = segment_sequences.substr(segment_seq_offset, len); + auto tags = collect_optional_tags_for_row(gfaz_paths.segment_optional_fields, + optional_string_offsets, + optional_byte_offsets, + i); + for (auto& listener : this->node_listeners) { + listener(id, as_chars(sequence), tags); + } + gfaz_paths.node_lengths[id] = len; + segment_seq_offset += len; + } + } + + gfaz_paths.links.from_ids = Codec::decompress_delta_varint_uint32( + compressed.link_from_ids_zstd, compressed.num_links); + gfaz_paths.links.to_ids = Codec::decompress_delta_varint_uint32( + compressed.link_to_ids_zstd, compressed.num_links); + gfaz_paths.links.from_orients = Codec::decompress_orientations( + compressed.link_from_orients_zstd, compressed.num_links); + gfaz_paths.links.to_orients = Codec::decompress_orientations( + compressed.link_to_orients_zstd, compressed.num_links); + gfaz_paths.links.overlap_nums = + Codec::zstd_decompress_uint32_vector(compressed.link_overlap_nums_zstd); + gfaz_paths.links.overlap_ops = + Codec::zstd_decompress_char_vector(compressed.link_overlap_ops_zstd); + + for (size_t i = 0; i < gfaz_paths.links.from_ids.size(); ++i) { + string overlap; + if (i < gfaz_paths.links.overlap_ops.size() && gfaz_paths.links.overlap_ops[i] != '\0') { + overlap = to_string(i < gfaz_paths.links.overlap_nums.size() ? gfaz_paths.links.overlap_nums[i] : 0) + + gfaz_paths.links.overlap_ops[i]; + } + for (auto& listener : this->edge_listeners) { + listener(gfaz_paths.links.from_ids[i], + i < gfaz_paths.links.from_orients.size() ? gfaz_paths.links.from_orients[i] == '-' : false, + gfaz_paths.links.to_ids[i], + i < gfaz_paths.links.to_orients.size() ? gfaz_paths.links.to_orients[i] == '-' : false, + as_chars(overlap), + GFAParser::tag_list_t()); + } + } + + gfaz_paths.path_names = + decompress_string_column(compressed.names_zstd, compressed.name_lengths_zstd); + gfaz_paths.walk_sample_ids = decompress_string_column( + compressed.walk_sample_ids_zstd, compressed.walk_sample_id_lengths_zstd); + gfaz_paths.walk_hap_indices = + Codec::zstd_decompress_uint32_vector(compressed.walk_hap_indices_zstd); + gfaz_paths.walk_seq_ids = decompress_string_column( + compressed.walk_seq_ids_zstd, compressed.walk_seq_id_lengths_zstd); + gfaz_paths.walk_seq_starts = Codec::decompress_varint_int64( + compressed.walk_seq_starts_zstd, compressed.walk_lengths.size()); + gfaz_paths.walk_seq_ends = Codec::decompress_varint_int64( + compressed.walk_seq_ends_zstd, compressed.walk_lengths.size()); + + auto decoded_rules = decode_rules(compressed); + gfaz_paths.rules_first = std::move(decoded_rules.first); + gfaz_paths.rules_second = std::move(decoded_rules.second); + gfaz_paths.min_rule_id = compressed.min_rule_id(); + + if (!compressed.paths_zstd.payload.empty()) { + gfaz_paths.paths_flat = Codec::zstd_decompress_int32_vector(compressed.paths_zstd); + } + if (!compressed.walks_zstd.payload.empty()) { + gfaz_paths.walks_flat = Codec::zstd_decompress_int32_vector(compressed.walks_zstd); + } + gfaz_paths.path_offsets = build_offsets(compressed.sequence_lengths); + gfaz_paths.walk_offsets = build_offsets(compressed.walk_lengths); + gfaz_paths.original_path_offsets = build_offsets(compressed.original_path_lengths); + gfaz_paths.original_walk_offsets = build_offsets(compressed.original_walk_lengths); + + size_t encoded_path_count = + gfaz_paths.path_offsets.empty() ? 0 : gfaz_paths.path_offsets.size() - 1; + size_t path_count = min(gfaz_paths.path_names.size(), encoded_path_count); + for (size_t i = 0; i < path_count; ++i) { + vector visits = decode_sequence_at_index( + gfaz_paths.paths_flat, gfaz_paths.path_offsets, + gfaz_paths.original_path_offsets, i, gfaz_paths.rules_first, + gfaz_paths.rules_second, gfaz_paths.min_rule_id, compressed.delta_round); + auto visit_iteratee = make_gfaz_visit_iteratee(visits); + handle_duplicate_paths(*this, 'P', [&]() { + for (auto& listener : this->path_listeners) { + listener(gfaz_paths.path_names[i], visit_iteratee, GFAParser::tag_list_t()); + } + }); + } + + size_t walk_count = gfaz_paths.walk_offsets.empty() ? 0 : gfaz_paths.walk_offsets.size() - 1; + for (size_t i = 0; i < walk_count; ++i) { + vector visits = decode_sequence_at_index( + gfaz_paths.walks_flat, gfaz_paths.walk_offsets, + gfaz_paths.original_walk_offsets, i, gfaz_paths.rules_first, + gfaz_paths.rules_second, gfaz_paths.min_rule_id, compressed.delta_round); + auto visit_iteratee = make_gfaz_visit_iteratee(visits); + string sample_name = i < gfaz_paths.walk_sample_ids.size() ? gfaz_paths.walk_sample_ids[i] : "*"; + int64_t haplotype = i < gfaz_paths.walk_hap_indices.size() ? gfaz_paths.walk_hap_indices[i] : 0; + string contig_name = i < gfaz_paths.walk_seq_ids.size() ? gfaz_paths.walk_seq_ids[i] : PathMetadata::NO_LOCUS_NAME; + int64_t start = i < gfaz_paths.walk_seq_starts.size() ? gfaz_paths.walk_seq_starts[i] : PathMetadata::NO_END_POSITION; + int64_t end = i < gfaz_paths.walk_seq_ends.size() ? gfaz_paths.walk_seq_ends[i] : PathMetadata::NO_END_POSITION; + subrange_t subrange = PathMetadata::NO_SUBRANGE; + if (start != PathMetadata::NO_END_POSITION) { + subrange.first = start; + if (end != PathMetadata::NO_END_POSITION) { + subrange.second = end; + } + } + handle_duplicate_paths(*this, 'W', [&]() { + for (auto& listener : this->walk_listeners) { + listener(sample_name, haplotype, contig_name, subrange, visit_iteratee, GFAParser::tag_list_t()); + } + }); + } + + dispatch_rgfa_visits(gfaz_paths.node_count, gfaz_paths.segment_optional_fields, + gfaz_paths.node_lengths, this->max_rgfa_rank, + [&](nid_t id, int64_t offset, size_t length, const string& path_name, int64_t path_rank) { + handle_duplicate_paths(*this, 'S', [&]() { + for (auto& listener : this->rgfa_listeners) { + listener(id, offset, length, path_name, path_rank); + } + }); + }); +} + +} // namespace algorithms +} // namespace vg diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 95f6127da68..86ae40f2e04 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -1926,7 +1926,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // TODO: this could be fragile if we repurpose this lambda for Reference GFA w/ Haplotypes // if we're constructing from a reference GFA, we don't need anything from W lines unordered_set ignore{PathSense::HAPLOTYPE}; - algorithms::gfa_to_path_handle_graph(input_filename, graph.get(), numeric_limits::max(), translation_name, &ignore); + algorithms::load_gfa_or_gfaz_to_path_handle_graph(input_filename, graph.get(), numeric_limits::max(), translation_name, &ignore); } catch (algorithms::GFAFormatError& e) { error(context) << "GFA file " << input_filename << " is not usable in VG.\n" << e.what() << endl; @@ -6470,4 +6470,3 @@ const IndexGroup& RewindPlanException::get_indexes() const noexcept { } } - diff --git a/src/io/register_libvg_io.cpp b/src/io/register_libvg_io.cpp index cd4faabb603..4ca77f31c56 100644 --- a/src/io/register_libvg_io.cpp +++ b/src/io/register_libvg_io.cpp @@ -20,6 +20,7 @@ #include "register_loader_saver_packed_graph.hpp" #include "register_loader_saver_hash_graph.hpp" #include "register_loader_saver_gfa.hpp" +#include "register_loader_saver_gfaz.hpp" #include "register_loader_saver_zip_codes.hpp" #include "register_loader_params_json.hpp" @@ -45,6 +46,7 @@ bool register_libvg_io() { register_loader_saver_snarl_manager(); register_loader_saver_vg(); register_loader_saver_gfa(); + register_loader_saver_gfaz(); register_loader_saver_xg(); register_loader_saver_packed_graph(); register_loader_saver_hash_graph(); diff --git a/src/io/register_loader_saver_gfaz.cpp b/src/io/register_loader_saver_gfaz.cpp new file mode 100644 index 00000000000..82f09903e6d --- /dev/null +++ b/src/io/register_loader_saver_gfaz.cpp @@ -0,0 +1,86 @@ +/** + * \file register_loader_saver_gfaz.cpp + */ + +#include +#include "register_loader_saver_gfaz.hpp" +#include "algorithms/gfa_to_handle.hpp" + +#include "gfa.hpp" + +#include "handle.hpp" +#include "save_handle_graph.hpp" +#include "../utility.hpp" + +#include +#include +#include +#include + +namespace vg { +namespace io { + +using namespace std; +using namespace vg::io; + +static bool sniff_gfaz(istream& stream) { + if (!stream) { + return false; + } + char buffer[sizeof(uint32_t)]; + size_t buffer_used = 0; + while (stream.peek() != EOF && buffer_used < sizeof(buffer)) { + buffer[buffer_used++] = (char) stream.get(); + } + for (size_t i = 0; i < buffer_used; ++i) { + stream.unget(); + if (!stream) { + throw runtime_error("Ungetting failed after " + to_string(i) + " characters"); + } + } + if (!stream) { + stream.clear(); + } + return algorithms::GFAzParser::has_magic(buffer, buffer_used); +} + +void register_loader_saver_gfaz() { + Registry::register_bare_loader_saver_with_header_check("GFAZ", sniff_gfaz, [](istream& input, const string& filename) -> void* { + GFAHandleGraph* gfa_graph = new GFAHandleGraph(); + string temp_name; + try { + string load_name = filename; + if (load_name.empty() || load_name == "-") { + temp_name = temp_file::create("gfaz-load"); + ofstream temp_stream(temp_name, ios::binary); + temp_stream << input.rdbuf(); + temp_stream.close(); + load_name = temp_name; + } + algorithms::GFAzParser parser; + parser.external_id_map = &gfa_graph->gfa_id_space; + algorithms::parser_to_path_handle_graph(parser, gfa_graph); + parser.parse(load_name); + gfa_graph->gfa_id_space.invert_translation(); + } catch (algorithms::GFAFormatError& e) { + cerr << "error[register_loader_saver_gfaz] GFAZ "; + if (!filename.empty() && filename != "-") { + cerr << "file " << filename; + } else { + cerr << "stream"; + } + cerr << " cannot be loaded: " << e.what() << endl; + exit(1); + } + if (!temp_name.empty()) { + unlink(temp_name.c_str()); + } + return (void*) gfa_graph; + }, [](const void* gfa_graph_void, ostream& output) { + assert(gfa_graph_void != nullptr); + graph_to_gfa((const GFAHandleGraph*) gfa_graph_void, output); + }); +} + +} +} diff --git a/src/io/register_loader_saver_gfaz.hpp b/src/io/register_loader_saver_gfaz.hpp new file mode 100644 index 00000000000..cc8e5001a09 --- /dev/null +++ b/src/io/register_loader_saver_gfaz.hpp @@ -0,0 +1,12 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GFAZ_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GFAZ_HPP_INCLUDED + +namespace vg { +namespace io { + +void register_loader_saver_gfaz(); + +} +} + +#endif diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 16f78068c22..be61a5806e1 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -3,6 +3,7 @@ #include "../utility.hpp" #include "xg.hpp" #include "../algorithms/gfa_to_handle.hpp" + #include "../io/save_handle_graph.hpp" #include "../gfa.hpp" #include "../gbzgraph.hpp" @@ -314,8 +315,9 @@ int main_convert(int argc, char** argv) { bdsg::HashGraph intermediate; logger.warn() << "currently cannot convert GFA directly to XG; " << "converting through another format" << endl; - vg::algorithms::gfa_to_path_handle_graph(input_stream_name, &intermediate, - input_rgfa_rank, gfa_trans_path); + vg::algorithms::load_gfa_or_gfaz_to_path_handle_graph(input_stream_name, &intermediate, + input_rgfa_rank, gfa_trans_path, + nullptr); graph_to_xg_adjusting_paths(&intermediate, xg_graph, ref_samples, hap_locus, new_sample, drop_haplotypes); } else { @@ -327,14 +329,15 @@ int main_convert(int argc, char** argv) { MutablePathMutableHandleGraph* mutable_output_graph \ = dynamic_cast(output_path_graph); assert(mutable_output_graph != nullptr); - vg::algorithms::gfa_to_path_handle_graph(input_stream_name, mutable_output_graph, - input_rgfa_rank, gfa_trans_path); + vg::algorithms::load_gfa_or_gfaz_to_path_handle_graph(input_stream_name, mutable_output_graph, + input_rgfa_rank, gfa_trans_path, + nullptr); } else { MutableHandleGraph* mutable_output_graph = dynamic_cast(output_graph.get()); assert(mutable_output_graph != nullptr); - vg::algorithms::gfa_to_handle_graph(input_stream_name, mutable_output_graph, - gfa_trans_path); + vg::algorithms::load_gfa_or_gfaz_to_handle_graph(input_stream_name, mutable_output_graph, + gfa_trans_path); } } catch (vg::algorithms::GFAFormatError& e) { logger.error() << "Input GFA is not acceptable.\n" << e.what() << endl; @@ -467,7 +470,7 @@ int main_convert(int argc, char** argv) { void help_convert(char** argv) { cerr << "usage: " << argv[0] << " convert [options] " << endl << "input options:" << endl - << " -g, --gfa-in input in GFA format" << endl + << " -g, --gfa-in input in GFA or GFAZ format" << endl << " -r, --in-rgfa-rank N import rgfa tags with rank <= N as paths [0]" << endl << " --ref-sample STR change haplotypes for this sample to" << endl << " reference paths (may repeat)" << endl @@ -514,7 +517,7 @@ void help_convert(char** argv) { void no_multiple_inputs(const Logger& logger, input_type input) { if (input != INPUT_DEFAULT) { - logger.error() << "cannot combine input types (GFA, GAM, GAF)" << endl; + logger.error() << "cannot combine input types (GFA/GFAZ, GAM, GAF)" << endl; } } diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 57076d0d323..9fc190083d5 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -138,7 +138,7 @@ int main_sort(int argc, char *argv[]) { // Read as GFA graph.reset(new VG()); try { - algorithms::gfa_to_path_handle_graph(filename, graph.get()); + algorithms::load_gfa_or_gfaz_to_path_handle_graph(filename, graph.get()); } catch(algorithms::GFAFormatError& e) { // GFA loading has failed because the file is invalid logger.error() << e.what() << endl; @@ -230,4 +230,3 @@ int main_sort(int argc, char *argv[]) { // Register subcommand static Subcommand vg_sort("sort", "sort variant graph by various algorithms", DEPRECATED, main_sort); - diff --git a/src/subcommand/view_main.cpp b/src/subcommand/view_main.cpp index 93f85190b1f..6946a74f96c 100644 --- a/src/subcommand/view_main.cpp +++ b/src/subcommand/view_main.cpp @@ -565,10 +565,10 @@ int main_view(int argc, char** argv) { try { // Use the disk-backed GFA loader that `vg convert` also uses. - vg::algorithms::gfa_to_path_handle_graph(file_name, - dynamic_cast(graph.get()), - nullptr, - 0); // set rgfa path rank to 0 to be consistent with vg convert's default logic + vg::algorithms::load_gfa_or_gfaz_to_path_handle_graph(file_name, + dynamic_cast(graph.get()), + nullptr, + 0); // set rgfa path rank to 0 to be consistent with vg convert's default logic } catch (vg::algorithms::GFAFormatError& e) { logger.error() << "Input GFA is not acceptable\n" << e.what() << endl; } catch (std::ios_base::failure& e) { @@ -1022,4 +1022,3 @@ int main_view(int argc, char** argv) { // Register subcommand static Subcommand vg_view("view", "format conversions for graphs and alignments", TOOLKIT, main_view); - diff --git a/test/t/03_vg_view.t b/test/t/03_vg_view.t index 28bd7e63336..6e10c8d6b70 100644 --- a/test/t/03_vg_view.t +++ b/test/t/03_vg_view.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 23 +plan tests 24 is $(vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz | vg view -d - | wc -l) 505 "view produces the expected number of lines of dot output" is $(vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz | vg view -g - | wc -l) 503 "view produces the expected number of lines of GFA output" @@ -77,4 +77,5 @@ rm -f paths.truth.txt paths.test.txt vg view graphs/hap_subrange.gfa >/dev/null is "${?}" "0" "vg view can emit a GFA file with subranges on a haplotype without explicit phase blocks" - +diff <(vg view tiny/tiny.gfa | grep '^S' | cut -f3 | sort) <(vg view tiny/tiny.gfaz | grep '^S' | cut -f3 | sort) +is "${?}" "0" "vg view preserves segment sequences for matching GFA and GFAZ inputs" diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index d085565505f..67f36745ec4 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 30 +plan tests 31 vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg is $? 0 "construction" @@ -122,6 +122,8 @@ vg convert -gp tiny/tiny.gfa | vg find -x - -n 1 -c 2 | vg convert -f - | vg ids vg find -x tiny/tiny.gfa -n 1 -c 2| vg ids -s - | sort > found2.gfa diff found1.gfa found2.gfa is $? 0 "GFA i/o for find -n consistent with converting both ways" +vg find -x tiny/tiny.gfaz -n 1 -c 2 >/dev/null +is $? 0 "find accepts GFAZ graph input" # Find nodes that map to the provided ids diff --git a/test/t/10_vg_stats.t b/test/t/10_vg_stats.t index e2d07e63ac9..10e70864068 100644 --- a/test/t/10_vg_stats.t +++ b/test/t/10_vg_stats.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 21 +plan tests 23 vg construct -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz >z.vg #is $? 0 "construction of a 1 megabase graph from the 1000 Genomes succeeds" @@ -63,6 +63,8 @@ vg index graphs/atgc.vg -x atgc.xg is "$(vg stats -F atgc.xg)" "format: XG" "vg stats -F detects format of xg graph" vg convert graphs/atgc.vg -f > atgc.gfa is "$(vg stats -F atgc.gfa)" "format: GFA" "vg stats -F detects format of GFA graph" +is "$(vg stats -F tiny/tiny.gfaz)" "format: GFA" "vg stats -F classifies GFAZ input on the GFA graph path" +is "$(vg stats -z tiny/tiny.gfaz)" "$(vg stats -z tiny/tiny.gfa)" "basic stats agree between GFA and GFAZ" rm -f atgc.vg atgc.hg atgc.pg atgc.xg atgc.gfa vg autoindex -v tiny/tiny.vcf.gz -r tiny/tiny.fa -w giraffe -p tiny is "$(vg stats -F tiny.giraffe.gbz)" "format: GBZ" "vg stats -F detects format of GBZ graph" diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index e7306d1431e..87a3a54b56b 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 60 +plan tests 61 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -128,6 +128,8 @@ vg paths --list -x x.gbz >out.txt is "${?}" "0" "vg paths can list paths from a GBZ with only haplotypes" is "$(cat out.txt | wc -l)" "1" "vg paths sees the haplotype path in a GBZ with only haplotypes" +diff <(vg paths -x tiny/tiny.gfa -F | sort) <(vg paths -x tiny/tiny.gfaz -F | sort) +is "${?}" "0" "vg paths emits matching path FASTA for equivalent GFA and GFAZ inputs" rm -f norm_x4.gfa original.fa norm_x4.fa x4.path x4.norm.path out.txt err.txt rm -f empty.vg empty.gbwt empty.fa @@ -209,4 +211,3 @@ rm -f cross_merge_test.vg cross_merge_right_test.vg - diff --git a/test/t/45_vg_sort.t b/test/t/45_vg_sort.t index 43b96a69ea8..e50f3df5c40 100644 --- a/test/t/45_vg_sort.t +++ b/test/t/45_vg_sort.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 5 +plan tests 6 # Make a big graph that is probably not in sorted order # Chopping it accomplishes that and makes it have several chunks @@ -31,8 +31,10 @@ is "${?}" "0" "a vg graph can be sorted with Eades algorithm without crashing" vg sort -a max-flow -r q giab.vg >/dev/null is "${?}" "0" "a vg graph can be sorted with the max-flow algorithm without crashing" +diff <(vg paths -v <(vg sort -a id tiny/tiny.gfa) -E | sort) <(vg paths -v <(vg sort -a id tiny/tiny.gfaz) -E | sort) +is "${?}" "0" "sorting preserves path sequences for matching GFA and GFAZ inputs" + rm -f giab.vg giab.sorted.vg giab.sorted.vg.vgi ids.txt ids.sorted.txt - diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index 6d4b14760a3..206364df25e 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 86 +plan tests 92 vg construct -r complex/c.fa -v complex/c.vcf.gz > c.vg cat <(vg view c.vg | grep ^S | sort) <(vg view c.vg | grep L | uniq | wc -l) <(vg paths -v c.vg -E) > c.info @@ -381,6 +381,31 @@ vg convert tiny/tiny.gfa -p | vg convert -f - | sort > tiny.roundtrip2.gfa diff tiny.roundtrip.gfa tiny.roundtrip2.gfa is $? 0 "No difference roundtripping a GFA if it's loaded as a GFA or HandleGraph" +vg convert -g tiny/tiny.gfa -p > tiny.gfa.input.pg +vg convert -g tiny/tiny.gfaz -p -t 1 > tiny.gfaz.input.pg +diff <(vg paths -v tiny.gfa.input.pg -E | sort) <(vg paths -v tiny.gfaz.input.pg -E | sort) +is $? 0 "GFAZ conversion preserves path sequences from equivalent GFA input" + +vg convert tiny.gfaz.input.pg -f | sort > tiny.gfaz.roundtrip.gfa +vg convert -g tiny/tiny.gfaz -p -t 1 | vg convert -f - | sort > tiny.gfaz.input.norm.gfa +diff <(grep '^S' tiny.roundtrip.gfa | cut -f3 | sort) <(grep '^S' tiny.gfaz.input.norm.gfa | cut -f3 | sort) +is $? 0 "GFAZ conversion preserves segment sequences from equivalent GFA input" + +vg convert tiny/tiny.gfaz -p -t 1 | vg convert -f - | sort > tiny.gfaz.roundtrip.auto.gfa +diff tiny.gfaz.roundtrip.gfa tiny.gfaz.roundtrip.auto.gfa +is $? 0 "GFAZ input is auto-detected through the registry loader" + +vg convert -g tiny/tiny.gfaz -p -t 4 | vg convert -f - | sort > tiny.gfaz.roundtrip.mt.gfa +diff tiny.gfaz.roundtrip.gfa tiny.gfaz.roundtrip.mt.gfa +is $? 0 "GFAZ roundtrip output is stable across thread counts" + +vg convert -g tiny/tiny.gfaz -x > tiny.gfaz.input.xg +diff <(vg paths -v tiny.gfa.input.pg -E | sort) <(vg paths -v tiny.gfaz.input.xg -E | sort) +is $? 0 "GFAZ conversion through XG preserves path sequences from equivalent GFA input" + +vg convert -g tiny/tiny.gfaz -T tiny.gfaz.trans -p > /dev/null +is $(wc -l < tiny.gfaz.trans) 0 "GFAZ import writes no translation lines for dense numeric segment IDs" + grep -v "S 6" tiny/tiny.gfa > tiny.unsort.gfa grep "S 6" tiny/tiny.gfa >> tiny.unsort.gfa cat tiny.unsort.gfa | vg convert -p - 2> tiny.roundtrip3.stderr | vg convert -f - | sort > tiny.roundtrip3.gfa @@ -414,6 +439,9 @@ is $? 0 "Modding unsorted GFA file produces same output as going through convert is $(cat tiny.chop3.4.stderr | wc -l) 0 "No warnings given when input GFA file is unsorted" rm -f tiny.roundtrip.gfa tiny.roundtrip2.gfa tiny.roundtrip3.gfa tiny.roundtrip4.gfa +rm -f tiny.gfa.input.pg tiny.gfaz.input.pg +rm -f tiny.gfaz.roundtrip.gfa tiny.gfaz.roundtrip.auto.gfa tiny.gfaz.roundtrip.mt.gfa +rm -f tiny.gfaz.input.norm.gfa tiny.gfaz.input.xg tiny.gfaz.trans rm -f tiny.roundtrip3.stderr tiny.roundtrip4.stderr rm -f tiny.unsort.gfa rm -f tiny.chop3.gfa tiny.chop3.1.gfa tiny.chop3.2.gfa tiny.chop3.3.gfa tiny.chop3.4.gfa diff --git a/test/t/53_clip.t b/test/t/53_clip.t index edfec715fdf..b251d0a065c 100644 --- a/test/t/53_clip.t +++ b/test/t/53_clip.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 21 +plan tests 22 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg @@ -84,6 +84,8 @@ printf "L\t200\t+\t300\t+\t0M\n" >> tiny-stubs.gfa vg clip tiny.gfa -s -P x | sort > tiny-nostubs.gfa diff tiny.gfa tiny-nostubs.gfa is "$?" 0 "stub clipping removed all stubs" +diff <(vg clip tiny/tiny.gfa -s -P x | vg paths -v - -E | sort) <(vg clip tiny/tiny.gfaz -s -P x | vg paths -v - -E | sort) +is "$?" 0 "clip preserves path sequences for matching GFA and GFAZ inputs" printf "x\t5\t25\n" > region.bed is $(vg clip tiny-stubs.gfa -s -b region.bed | vg stats -N -) "17" "region clipping filtered out only 2 / 4 stub nodes" @@ -107,4 +109,3 @@ is $(vg stats -E sc-A2d1g.gfa) "30" "One net edges clipped" is $(vg stats -N sc-A2d1g.gfa) "22" "No other nodes were clipped with net edge filter" rm -f sc-A2d1g.gfa - diff --git a/test/tiny/tiny.gfaz b/test/tiny/tiny.gfaz new file mode 100644 index 00000000000..02e87cb40bd Binary files /dev/null and b/test/tiny/tiny.gfaz differ