Skip to content

Commit ddb17ba

Browse files
committed
add sam_tag_dictionary
1 parent b1ca0eb commit ddb17ba

File tree

6 files changed

+584
-15
lines changed

6 files changed

+584
-15
lines changed

include/bio/format/sam_input_handler.hpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <bio/map_io/header.hpp>
3333
#include <bio/map_io/misc.hpp>
3434
#include <bio/map_io/sam_flag.hpp>
35+
#include <bio/map_io/sam_tag_dictionary.hpp>
3536

3637
namespace bio
3738
{
@@ -109,6 +110,13 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
109110
get<field::tlen>(raw_record) = (*file_it).fields[8];
110111
get<field::seq>(raw_record) = (*file_it).fields[9];
111112
get<field::qual>(raw_record) = (*file_it).fields[10];
113+
114+
// one past fields[10].end() (i.e. just behind the following \t or \n), computed from data() so it is guaranteed to be char const *
115+
char const * end_qual = (*file_it).fields[10].data() + (*file_it).fields[10].size() + 1/*\t or \n*/;
116+
// equivalent of line.end(), computed from data() so it is guaranteed to be char const *
117+
char const * end_line = (*file_it).line.data() + (*file_it).line.size();
118+
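// SAM tags go from the end of qual until the end of the line (possibly empty)
// NOTE(review): if QUAL is the last field and `line` excludes the terminator, end_qual lies one past end_line and the length below would wrap around — confirm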
119+
get<field::tags>(raw_record) = std::string_view{end_qual, static_cast<size_t>(end_line - end_qual)};
112120
}
113121

114122
/* PARSED RECORD HANDLING */
@@ -160,7 +168,7 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
160168
}
161169
}
162170

163-
/* POS, MAPQ are handled correctly by default, unless we want pos to be read into an std:optional */
171+
/* POS, MAPQ are handled correctly by default */
164172

165173
//!\brief Overload for parsing CIGAR.
166174
void parse_field(vtag_t<field::cigar> const & /**/, std::vector<seqan3::cigar> & cigar_vector)
@@ -242,6 +250,16 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
242250
parse_field_aux(raw_field, parsed_field); // reading into e.g. dna4 vector
243251
}
244252

253+
//!\brief Overload for parsing the SAM tag dictionary.
//!\details Reads the raw tags field from `raw_record`; if it is non-empty, the field is split on '\t' and each
//!         resulting piece is handed to `dictionary.parse_and_emplace()`. An empty raw field results in no calls.
//!\param[in,out] dictionary The dictionary that receives the parsed tags; this overload does not clear it.
254+
void parse_field(vtag_t<field::tags> const & /**/, map_io::sam_tag_dictionary & dictionary)
255+
{
256+
std::string_view raw_field = get<field::tags>(raw_record);
257+
258+
if (!raw_field.empty())
259+
for (std::string_view const tag_field : raw_field | detail::eager_split('\t'))
260+
dictionary.parse_and_emplace(tag_field);
261+
}
262+
245263
//!\brief Overload for parsing the private data.
246264
void parse_field(vtag_t<field::_private> const & /**/, map_io::record_private_data & parsed_field)
247265
{

include/bio/map_io/misc.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ inline constexpr auto default_field_ids = vtag<field::qname,
3333
field::tlen,
3434
field::seq,
3535
field::qual,
36+
field::tags,
3637
field::_private>;
3738

3839
} // namespace bio::map_io

include/bio/map_io/reader_options.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <bio/map_io/header.hpp>
2929
#include <bio/map_io/misc.hpp>
3030
#include <bio/map_io/sam_flag.hpp>
31+
#include <bio/map_io/sam_tag_dictionary.hpp>
3132

3233
namespace bio::map_io
3334
{
@@ -68,6 +69,7 @@ inline constexpr auto field_types_sam =
6869
int32_t, // field::tlen,
6970
decltype(std::string_view{} | seqan3::views::char_to<seqan3::dna5>), // field::seq
7071
decltype(std::string_view{} | seqan3::views::char_to<seqan3::phred42>), // field::qual
72+
sam_tag_dictionary, // field::tags
7173
record_private_data>; // field::_private
7274
//!\}
7375

0 commit comments

Comments
 (0)