Skip to content

Commit c27d493

Browse files
authored
Improve parsing efficiency by switching to byteme::PerByte.
This eliminates the need for the leftover buffer and avoids the unnecessary copies that were needed to shift the used bytes in the leftover buffer on every call to the (now removed) extract_up_to. Such copying caused major performance degradation when parsing complex objects with lots of nested structure but little actual data. In addition, we now have access to PerByteParallel, which allows us to read from disk and parse in parallel. This should further improve performance.
1 parent 6ec74fa commit c27d493

21 files changed

Lines changed: 360 additions & 295 deletions

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
cmake_minimum_required(VERSION 3.24)
22

33
project(rds2cpp
4-
VERSION 1.0.1
4+
VERSION 1.1.0
55
DESCRIPTION "Standalone C++ library for reading RDS files"
66
LANGUAGES CXX)
77

include/rds2cpp/parse_altrep.hpp

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -14,28 +14,28 @@
1414

1515
namespace rds2cpp {
1616

17-
template<class Reader>
18-
IntegerVector parse_integer_body(Reader&, std::vector<unsigned char>&);
17+
template<class Source_>
18+
IntegerVector parse_integer_body(Source_&);
1919

20-
template<class Reader>
21-
DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>&);
20+
template<class Source_>
21+
DoubleVector parse_double_body(Source_& src);
2222

23-
template<class Reader>
24-
std::unique_ptr<RObject> parse_object(Reader&, std::vector<unsigned char>&, SharedParseInfo&);
23+
template<class Source_>
24+
std::unique_ptr<RObject> parse_object(Source_&, SharedParseInfo&);
2525

26-
template<class Reader>
27-
PairList parse_pairlist_body(Reader&, std::vector<unsigned char>&, const Header&, SharedParseInfo&);
26+
template<class Source_>
27+
PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&);
2828

2929
namespace altrep_internal {
3030

31-
template<class Vector, class Reader>
32-
Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& leftovers) try {
33-
auto header = parse_header(reader, leftovers);
31+
template<class Vector, class Source_>
32+
Vector parse_numeric_compact_seq(Source_& src) try {
33+
auto header = parse_header(src);
3434
if (header[3] != static_cast<unsigned char>(SEXPType::REAL)) {
3535
throw std::runtime_error("expected compact_seq to store sequence information in doubles");
3636
}
3737

38-
auto info = parse_double_body(reader, leftovers);
38+
auto info = parse_double_body(src);
3939
const auto& ranges = info.data;
4040
if (ranges.size() != 3) {
4141
throw std::runtime_error("expected compact_seq's sequence information to be of length 3");
@@ -49,7 +49,7 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& lef
4949
output.data[i] = start;
5050
}
5151

52-
auto terminator = parse_header(reader, leftovers);
52+
auto terminator = parse_header(src);
5353
if (terminator[3] != 254) {
5454
throw std::runtime_error("failed to terminate a compact_seq ALTREP correctly");
5555
}
@@ -59,36 +59,36 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& lef
5959
throw traceback("failed to parse compact numeric ALTREP", e);
6060
}
6161

62-
template<class Vector, class Reader>
63-
Vector parse_attribute_wrapper(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
64-
auto plist_header = parse_header(reader, leftovers);
62+
template<class Vector, class Source_>
63+
Vector parse_attribute_wrapper(Source_& src, SharedParseInfo& shared) try {
64+
auto plist_header = parse_header(src);
6565
if (plist_header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
6666
throw std::runtime_error("expected pairlist in wrap_* ALTREP's payload");
6767
}
6868

6969
// First pairlist element is a CONS cell where the first value is the wrapped integer vector.
7070

71-
auto contents = parse_object(reader, leftovers, shared);
71+
auto contents = parse_object(src, shared);
7272
if (contents->type() != Vector::vector_sexp_type) {
7373
throw std::runtime_error("incorrectly typed contents in wrap_* ALTREP's payload");
7474
}
7575

7676
// Second cons value is the wrapping metadata, we don't care about it.
77-
auto metaheader = parse_header(reader, leftovers);
77+
auto metaheader = parse_header(src);
7878
if (metaheader[3] != static_cast<unsigned char>(SEXPType::INT)) {
7979
throw std::runtime_error("wrap_* ALTREP should have an integer vector for its metadata");
8080
}
8181

82-
auto metadata = parse_integer_body(reader, leftovers);
82+
auto metadata = parse_integer_body(src);
8383
if (metadata.data.size() != 2) {
8484
throw std::runtime_error("wrap_* ALTREP's metadata should be a length-2 integer vector");
8585
}
8686

8787
// Now we can finally get the attributes, which makes up the rest of the pairlist.
8888
auto coerced = static_cast<Vector*>(contents.get());
89-
auto attrheader = parse_header(reader, leftovers);
89+
auto attrheader = parse_header(src);
9090
if (attrheader[3] == static_cast<unsigned>(SEXPType::LIST)) {
91-
parse_attributes_body(reader, leftovers, attrheader, coerced->attributes, shared);
91+
parse_attributes_body(src, attrheader, coerced->attributes, shared);
9292
} else if (attrheader[3] != static_cast<unsigned>(SEXPType::NILVALUE_)) {
9393
throw std::runtime_error("wrap_* ALTREP's attributes should be a pairlist or NULL");
9494
}
@@ -98,15 +98,15 @@ Vector parse_attribute_wrapper(Reader& reader, std::vector<unsigned char>& lefto
9898
throw traceback("failed to parse attribute-wrapped ALTREP", e);
9999
}
100100

101-
template<class Reader>
102-
StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
103-
auto plist_header = parse_header(reader, leftovers);
101+
template<class Source_>
102+
StringVector parse_deferred_string(Source_& src, SharedParseInfo& shared) try {
103+
auto plist_header = parse_header(src);
104104
if (plist_header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
105105
throw std::runtime_error("expected pairlist in deferred_string ALTREP's payload");
106106
}
107107

108108
// First pairlist element is a CONS cell where the first value is the thing to be converted.
109-
auto contents = parse_object(reader, leftovers, shared);
109+
auto contents = parse_object(src, shared);
110110
StringVector output;
111111

112112
if (contents->type() == SEXPType::INT){
@@ -160,18 +160,18 @@ StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& l
160160
}
161161

162162
// Second cons value is the wrapping metadata, we don't care about it.
163-
auto metaheader = parse_header(reader, leftovers);
163+
auto metaheader = parse_header(src);
164164
if (metaheader[3] != static_cast<unsigned char>(SEXPType::INT)) {
165165
throw std::runtime_error("deferred_string ALTREP should have an integer vector for its metadata");
166166
}
167167

168-
auto metadata = parse_integer_body(reader, leftovers);
168+
auto metadata = parse_integer_body(src);
169169
if (metadata.data.size() != 1) {
170170
throw std::runtime_error("deferred_string ALTREP's metadata should be a length-1 integer vector");
171171
}
172172

173173
// Chomp up the null.
174-
auto terminator = parse_header(reader, leftovers);
174+
auto terminator = parse_header(src);
175175
if (terminator[3] != static_cast<unsigned char>(SEXPType::NILVALUE_)) {
176176
throw std::runtime_error("failed to terminate a deferred string ALTREP correctly");
177177
}
@@ -183,14 +183,14 @@ StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& l
183183

184184
}
185185

186-
template<class Reader>
187-
std::unique_ptr<RObject> parse_altrep_body(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
188-
auto header = parse_header(reader, leftovers);
186+
template<class Source_>
187+
std::unique_ptr<RObject> parse_altrep_body(Source_& src, SharedParseInfo& shared) try {
188+
auto header = parse_header(src);
189189
if (header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
190190
throw std::runtime_error("expected ALTREP description to be a pairlist");
191191
}
192192

193-
auto plist = parse_pairlist_body(reader, leftovers, header, shared);
193+
auto plist = parse_pairlist_body(src, header, shared);
194194
if (plist.data.size() < 1 || plist.data[0]->type() != SEXPType::SYM) {
195195
throw std::runtime_error("expected type specification symbol in the ALTREP description");
196196
}
@@ -204,11 +204,11 @@ std::unique_ptr<RObject> parse_altrep_body(Reader& reader, std::vector<unsigned
204204
const auto& symb = shared.symbols[sdx->index];
205205

206206
if (symb.name == "wrap_integer") {
207-
pointerize_(altrep_internal::parse_attribute_wrapper<IntegerVector>(reader, leftovers, shared));
207+
pointerize_(altrep_internal::parse_attribute_wrapper<IntegerVector>(src, shared));
208208
} else if (symb.name == "compact_intseq") {
209-
pointerize_(altrep_internal::parse_numeric_compact_seq<IntegerVector>(reader, leftovers));
209+
pointerize_(altrep_internal::parse_numeric_compact_seq<IntegerVector>(src));
210210
} else if (symb.name == "deferred_string") {
211-
pointerize_(altrep_internal::parse_deferred_string(reader, leftovers, shared));
211+
pointerize_(altrep_internal::parse_deferred_string(src, shared));
212212
} else {
213213
throw std::runtime_error("unrecognized ALTREP type '" + symb.name + "'");
214214
}

include/rds2cpp/parse_atomic.hpp

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,22 @@ namespace rds2cpp {
1313

1414
namespace atomic_internal {
1515

16-
template<class Vector, class Reader>
17-
Vector parse_integer_or_logical_body(Reader& reader, std::vector<unsigned char>& leftovers) {
18-
size_t len = get_length(reader, leftovers);
16+
template<class Vector, class Source_>
17+
Vector parse_integer_or_logical_body(Source_& src) {
18+
size_t len = get_length(src);
1919
Vector output(len);
2020

2121
constexpr size_t width = 4;
22+
static_assert(width == sizeof(decltype(output.data[0])));
23+
size_t byte_length = width * len;
24+
2225
auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
23-
extract_up_to(reader, leftovers, width * len,
24-
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
25-
std::copy(buffer, buffer + n, ptr + i);
26+
for (size_t i = 0; i < byte_length; ++i) {
27+
if (!src.advance()) {
28+
throw empty_error();
2629
}
27-
);
30+
ptr[i] = src.get();
31+
}
2832

2933
// Flipping endianness.
3034
if (little_endian()) {
@@ -39,32 +43,36 @@ Vector parse_integer_or_logical_body(Reader& reader, std::vector<unsigned char>&
3943

4044
}
4145

42-
template<class Reader>
43-
IntegerVector parse_integer_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
44-
return atomic_internal::parse_integer_or_logical_body<IntegerVector>(reader, leftovers);
46+
template<class Source_>
47+
IntegerVector parse_integer_body(Source_& src) try {
48+
return atomic_internal::parse_integer_or_logical_body<IntegerVector>(src);
4549
} catch (std::exception& e) {
4650
throw traceback("failed to parse data for an integer vector", e);
4751
}
4852

49-
template<class Reader>
50-
LogicalVector parse_logical_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
51-
return atomic_internal::parse_integer_or_logical_body<LogicalVector>(reader, leftovers);
53+
template<class Source_>
54+
LogicalVector parse_logical_body(Source_& src) try {
55+
return atomic_internal::parse_integer_or_logical_body<LogicalVector>(src);
5256
} catch (std::exception& e) {
5357
throw traceback("failed to parse data for a logical vector", e);
5458
}
5559

56-
template<class Reader>
57-
DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
58-
size_t len = get_length(reader, leftovers);
60+
template<class Source_>
61+
DoubleVector parse_double_body(Source_& src) try {
62+
size_t len = get_length(src);
5963
DoubleVector output(len);
6064

6165
constexpr size_t width = 8;
66+
static_assert(width == sizeof(decltype(output.data[0])));
67+
size_t byte_length = width * len;
68+
6269
auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
63-
extract_up_to(reader, leftovers, width * len,
64-
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
65-
std::copy(buffer, buffer + n, ptr + i);
70+
for (size_t i = 0; i < byte_length; ++i) {
71+
if (!src.advance()) {
72+
throw empty_error();
6673
}
67-
);
74+
ptr[i] = src.get();
75+
}
6876

6977
// Flipping endianness.
7078
if (little_endian()) {
@@ -79,41 +87,48 @@ DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>& lefto
7987
throw traceback("failed to parse data for a double vector", e);
8088
}
8189

82-
template<class Reader>
83-
RawVector parse_raw_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
84-
size_t len = get_length(reader, leftovers);
90+
template<class Source_>
91+
RawVector parse_raw_body(Source_& src) try {
92+
size_t len = get_length(src);
8593
RawVector output(len);
8694

8795
auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
88-
extract_up_to(reader, leftovers, len,
89-
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
90-
std::copy(buffer, buffer + n, ptr + i);
96+
for (size_t i = 0; i < len; ++i) {
97+
if (!src.advance()) {
98+
throw empty_error();
9199
}
92-
);
100+
ptr[i] = src.get();
101+
}
93102

94103
return output;
95104
} catch (std::exception& e) {
96105
throw traceback("failed to parse data for a raw vector", e);
97106
}
98107

99-
template<class Reader>
100-
ComplexVector parse_complex_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
101-
size_t len = get_length(reader, leftovers);
108+
template<class Source_>
109+
ComplexVector parse_complex_body(Source_& src) try {
110+
size_t len = get_length(src);
102111
ComplexVector output(len);
103112

104113
constexpr size_t width = 16;
114+
static_assert(width == sizeof(decltype(output.data[0])));
115+
size_t byte_length = width * len;
116+
105117
auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
106-
extract_up_to(reader, leftovers, width * len,
107-
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
108-
std::copy(buffer, buffer + n, ptr + i);
118+
for (size_t b = 0; b < byte_length; ++b) {
119+
if (!src.advance()) {
120+
throw empty_error();
109121
}
110-
);
122+
ptr[b] = src.get();
123+
}
111124

112125
// Flipping endianness for each double.
113126
if (little_endian()) {
127+
constexpr size_t single_width = width / 2;
128+
size_t single_length = len * 2;
114129
auto copy = ptr;
115-
for (size_t n = 0; n < len * 2; ++n, copy += width / 2) {
116-
std::reverse(copy, copy + width/2);
130+
for (size_t n = 0; n < single_length; ++n, copy += single_width) {
131+
std::reverse(copy, copy + single_width);
117132
}
118133
}
119134

@@ -122,12 +137,12 @@ ComplexVector parse_complex_body(Reader& reader, std::vector<unsigned char>& lef
122137
throw traceback("failed to parse data for a complex vector", e);
123138
}
124139

125-
template<class Reader>
126-
StringVector parse_string_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
127-
size_t len = get_length(reader, leftovers);
140+
template<class Source_>
141+
StringVector parse_string_body(Source_& src) try {
142+
size_t len = get_length(src);
128143
StringVector output(len);
129144
for (size_t i = 0; i < len; ++i) {
130-
auto str = parse_single_string(reader, leftovers);
145+
auto str = parse_single_string(src);
131146
output.data[i] = str.value;
132147
output.encodings[i] = str.encoding;
133148
output.missing[i] = str.missing;

include/rds2cpp/parse_attributes.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,16 @@
1010

1111
namespace rds2cpp {
1212

13-
template<class Reader>
14-
PairList parse_pairlist_body(Reader&, std::vector<unsigned char>&, const Header&, SharedParseInfo&);
13+
template<class Source_>
14+
PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&);
1515

1616
inline bool has_attributes(const Header& header) {
1717
return (header[2] & 0x2);
1818
}
1919

20-
template<class Reader>
21-
void parse_attributes_body(Reader& reader, std::vector<unsigned char>& leftovers, const Header& header, Attributes& output, SharedParseInfo& shared) try {
22-
auto plist = parse_pairlist_body(reader, leftovers, header, shared);
20+
template<class Source_>
21+
void parse_attributes_body(Source_& src, const Header& header, Attributes& output, SharedParseInfo& shared) try {
22+
auto plist = parse_pairlist_body(src, header, shared);
2323

2424
size_t nnodes = plist.data.size();
2525
for (size_t t = 0; t < nnodes; ++t) {
@@ -35,13 +35,13 @@ void parse_attributes_body(Reader& reader, std::vector<unsigned char>& leftovers
3535
throw traceback("failed to parse attribute contents", e);
3636
}
3737

38-
template<class Reader>
39-
void parse_attributes(Reader& reader, std::vector<unsigned char>& leftovers, Attributes& output, SharedParseInfo& shared) try {
40-
auto header = parse_header(reader, leftovers);
38+
template<class Source_>
39+
void parse_attributes(Source_& src, Attributes& output, SharedParseInfo& shared) try {
40+
auto header = parse_header(src);
4141
if (header[3] != static_cast<unsigned>(SEXPType::LIST)) {
4242
throw std::runtime_error("attributes should be a pairlist");
4343
}
44-
parse_attributes_body(reader, leftovers, header, output, shared);
44+
parse_attributes_body(src, header, output, shared);
4545
return;
4646
} catch (std::exception& e) {
4747
throw traceback("failed to parse attributes", e);

0 commit comments

Comments
 (0)