Skip to content

Commit 7305fbb

Browse files
xiaoxmengfacebook-github-bot
authored andcommitted
feat: Add kCompactRaw raw encoding support and simplify deserialization API (facebookincubator#633)
Summary: CONTEXT: The nimble serializer/deserializer needed several improvements: (1) Stream sizes in the kCompactRaw trailer only supported Trivial and Varint encoding. Stream sizes tend to be similar across streams, making delta encoding a natural fit. However, in practice delta-varint provides marginal savings over plain varint for stream sizes since the values vary enough that deltas don't compress much better. (2) DeserializerOptions required callers to pass a full SerializationVersion, but the deserializer auto-detects version from the header byte — only a bool flag is needed. (3) StreamData took two separate bool flags (encodingEnabled, useVarintRowCount) instead of deriving them from the version. (4) Several helper functions were missing for version classification. WHAT: - Add Zigzag.h utility (branchless zigzag encode/decode for int32/int64) and Delta encoding support for kCompactRaw stream sizes. Delta encoding stores the first value as-is, then zigzag+varint encodes consecutive deltas. Wire: [0x08][count:varint][first:varint][zigzag(delta_1):varint]...[trailer_size:u32] - Simplify DeserializerOptions: replace `std::optional<SerializationVersion> version` with `bool hasHeader`. When true, auto-detect version from the first byte of serialized data. When false, assume kLegacy (no version header). - Simplify StreamData: take SerializationVersion instead of two bool flags (encodingEnabled, useVarintRowCount), derive both internally. - Add version helper functions in Options.h: - `nonLegacyFormat()`: returns true for kCompact/kCompactRaw/kTabletRaw - `isRawFormat()`: returns true for kCompactRaw/kTabletRaw - `isTabletRawFormat()`: returns true for kTabletRaw only - Add validation in Serializer constructor: reject explicit kLegacy and kTabletRaw versions, validate streamSizesEncodingType requires non-legacy version. - Extract `StreamDataReader::readVersion()` method for version header parsing. - Add fuzz tests for mixed-version serialization and projection (cycles through kCompact, kCompactRaw, kCompactRaw+Delta) with decode verification. - SST writer benchmark: remove sst_partitioned_index flag, change nimble_stream_sizes_encoding default from Trivial to Delta. Reviewed By: srsuryadev, jiahaol-work, tanjialiang Differential Revision: D98941222
1 parent f8a1d5a commit 7305fbb

15 files changed

Lines changed: 678 additions & 224 deletions

dwio/nimble/common/Zigzag.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cstdint>
19+
20+
// ZigZag encoding maps signed integers to unsigned integers so that values
21+
// with small absolute value have small encoded values, making them efficient
22+
// for varint encoding.
23+
//
24+
// zigzagEncode32(0) == 0
25+
// zigzagEncode32(-1) == 1
26+
// zigzagEncode32(1) == 2
27+
// zigzagEncode32(-2) == 3
28+
// ...
29+
//
30+
// In general:
31+
// if x >= 0, zigzagEncode32(x) == 2*x
32+
// if x < 0, zigzagEncode32(x) == -2*x - 1
33+
34+
namespace facebook::nimble::zigzag {
35+
36+
inline constexpr uint32_t zigzagEncode32(int32_t val) noexcept {
37+
return static_cast<uint32_t>((val << 1) ^ (val >> 31));
38+
}
39+
40+
inline constexpr int32_t zigzagDecode32(uint32_t val) noexcept {
41+
return static_cast<int32_t>((val >> 1) ^ -(val & 1));
42+
}
43+
44+
inline constexpr uint64_t zigzagEncode64(int64_t val) noexcept {
45+
return static_cast<uint64_t>((val << 1) ^ (val >> 63));
46+
}
47+
48+
inline constexpr int64_t zigzagDecode64(uint64_t val) noexcept {
49+
return static_cast<int64_t>((val >> 1) ^ -(val & 1));
50+
}
51+
52+
} // namespace facebook::nimble::zigzag
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include <gtest/gtest.h>
17+
18+
#include <limits>
19+
20+
#include "dwio/nimble/common/Zigzag.h"
21+
22+
using namespace ::facebook::nimble::zigzag;
23+
24+
TEST(ZigzagTest, encode32) {
25+
EXPECT_EQ(zigzagEncode32(0), 0);
26+
EXPECT_EQ(zigzagEncode32(-1), 1);
27+
EXPECT_EQ(zigzagEncode32(1), 2);
28+
EXPECT_EQ(zigzagEncode32(-2), 3);
29+
EXPECT_EQ(zigzagEncode32(2), 4);
30+
EXPECT_EQ(zigzagEncode32(-3), 5);
31+
EXPECT_EQ(zigzagEncode32(std::numeric_limits<int32_t>::max()), 4294967294U);
32+
EXPECT_EQ(zigzagEncode32(std::numeric_limits<int32_t>::min()), 4294967295U);
33+
}
34+
35+
TEST(ZigzagTest, decode32) {
36+
EXPECT_EQ(zigzagDecode32(0), 0);
37+
EXPECT_EQ(zigzagDecode32(1), -1);
38+
EXPECT_EQ(zigzagDecode32(2), 1);
39+
EXPECT_EQ(zigzagDecode32(3), -2);
40+
EXPECT_EQ(zigzagDecode32(4), 2);
41+
EXPECT_EQ(zigzagDecode32(5), -3);
42+
EXPECT_EQ(zigzagDecode32(4294967294U), std::numeric_limits<int32_t>::max());
43+
EXPECT_EQ(zigzagDecode32(4294967295U), std::numeric_limits<int32_t>::min());
44+
}
45+
46+
TEST(ZigzagTest, roundTrip32) {
47+
for (int32_t val :
48+
{0,
49+
1,
50+
-1,
51+
100,
52+
-100,
53+
1'000'000,
54+
-1'000'000,
55+
std::numeric_limits<int32_t>::max(),
56+
std::numeric_limits<int32_t>::min()}) {
57+
EXPECT_EQ(zigzagDecode32(zigzagEncode32(val)), val);
58+
}
59+
}
60+
61+
TEST(ZigzagTest, encode64) {
62+
EXPECT_EQ(zigzagEncode64(0), 0);
63+
EXPECT_EQ(zigzagEncode64(-1), 1);
64+
EXPECT_EQ(zigzagEncode64(1), 2);
65+
EXPECT_EQ(zigzagEncode64(-2), 3);
66+
EXPECT_EQ(
67+
zigzagEncode64(std::numeric_limits<int64_t>::max()),
68+
std::numeric_limits<uint64_t>::max() - 1);
69+
EXPECT_EQ(
70+
zigzagEncode64(std::numeric_limits<int64_t>::min()),
71+
std::numeric_limits<uint64_t>::max());
72+
}
73+
74+
TEST(ZigzagTest, roundTrip64) {
75+
for (int64_t val :
76+
{int64_t{0},
77+
int64_t{1},
78+
int64_t{-1},
79+
int64_t{100},
80+
int64_t{-100},
81+
int64_t{1'000'000'000'000},
82+
int64_t{-1'000'000'000'000},
83+
std::numeric_limits<int64_t>::max(),
84+
std::numeric_limits<int64_t>::min()}) {
85+
EXPECT_EQ(zigzagDecode64(zigzagEncode64(val)), val);
86+
}
87+
}

dwio/nimble/serializer/Deserializer.cpp

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -91,14 +91,10 @@ class DeserializerImpl : public Decoder {
9191
DeserializerImpl(
9292
const Type* type,
9393
bool inMapStream,
94-
bool enableEncoding,
95-
bool useVarintRowCount,
9694
velox::memory::MemoryPool* pool)
9795
: type_{type},
9896
pool_{pool},
9997
inMapStream_{inMapStream},
100-
encodingEnabled_{enableEncoding},
101-
useVarintRowCount_{useVarintRowCount},
10298
scalarKind_{getScalarKindForType(*type)},
10399
typeStorageWidth_{getTypeStorageWidth(*type)} {}
104100

@@ -156,19 +152,18 @@ class DeserializerImpl : public Decoder {
156152
}
157153

158154
// Add data starting at the given row offset.
159-
void addBatch(uint32_t rowOffset, std::string_view data) {
155+
// version: the auto-detected serialization version, used to determine
156+
// encoding enabled and varint row count settings.
157+
void addBatch(
158+
uint32_t rowOffset,
159+
std::string_view data,
160+
SerializationVersion version) {
160161
if (data.empty()) {
161162
return;
162163
}
163164
batchSegments_.emplace_back(
164165
BatchSegment{
165-
rowOffset,
166-
serde::StreamData(
167-
scalarKind_,
168-
encodingEnabled_,
169-
useVarintRowCount_,
170-
data,
171-
pool_)});
166+
rowOffset, serde::StreamData(scalarKind_, version, data, pool_)});
172167
}
173168

174169
// Record a segment where this key is present in every row (in-map stream
@@ -499,10 +494,6 @@ class DeserializerImpl : public Decoder {
499494
// True for inMap streams (fills with 'false' when missing), false for nulls
500495
// streams (fills with 'true' when missing).
501496
const bool inMapStream_;
502-
// True when nimble encoding is used for scalar streams.
503-
const bool encodingEnabled_;
504-
// Whether encoding headers use varint (true) or fixed u32 (false) row counts.
505-
const bool useVarintRowCount_;
506497
// Cached from type at construction to avoid per-call dispatch.
507498
const ScalarKind scalarKind_;
508499
const uint32_t typeStorageWidth_;
@@ -639,15 +630,10 @@ Deserializer::Deserializer(
639630
void Deserializer::createDeserializersForType(
640631
const Type& type,
641632
uint32_t depth) {
642-
const bool enableEncoding = options_.enableEncoding();
643-
const bool useVarintRowCount =
644-
!isTabletRawFormat(options_.serializationVersion());
645633
deserializerMap_[getMainDescriptor(type).offset()] =
646634
std::make_unique<DeserializerImpl>(
647635
&type,
648636
/*inMapStream=*/false,
649-
enableEncoding,
650-
useVarintRowCount,
651637
pool_);
652638
// FlatMap is only supported at depth 1 (top-level columns). FlatMap keys can
653639
// vary across batches, causing gaps in nulls/inMap streams. Gap detection is
@@ -661,8 +647,6 @@ void Deserializer::createDeserializersForType(
661647
deserializerMap_[inMapOffset] = std::make_unique<DeserializerImpl>(
662648
&type,
663649
/*inMapStream=*/true,
664-
enableEncoding,
665-
useVarintRowCount,
666650
pool_);
667651
inMapChildTypes_[inMapOffset] = flatMap.childAt(i).get();
668652
}
@@ -691,6 +675,7 @@ void Deserializer::deserialize(
691675
serde::StreamDataReader reader{pool_, options_};
692676
for (auto sv : data) {
693677
const auto batchRows = reader.initialize(sv);
678+
const auto version = reader.version();
694679
// Reset present tracking from previous batch.
695680
if (hasInMapChildren && !inMapPresentOffsetsList_.empty()) {
696681
for (auto off : inMapPresentOffsetsList_) {
@@ -707,7 +692,7 @@ void Deserializer::deserialize(
707692
auto* decoder = deserializers_[offset];
708693
if (decoder != nullptr) {
709694
DeserializerImpl::toDecoderImpl(decoder)->addBatch(
710-
rowOffset, streamData);
695+
rowOffset, streamData, version);
711696
}
712697
}
713698
});

dwio/nimble/serializer/DeserializerImpl.cpp

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,13 @@ namespace facebook::nimble::serde {
3030

3131
StreamData::StreamData(
3232
ScalarKind kind,
33-
bool encodingEnabled,
34-
bool useVarintRowCount,
33+
SerializationVersion version,
3534
std::string_view data,
3635
velox::memory::MemoryPool* pool)
3736
: kind_{kind},
3837
pool_{pool},
39-
encodingEnabled_{encodingEnabled},
40-
useVarintRowCount_{useVarintRowCount} {
38+
encodingEnabled_{nonLegacyFormat(version)},
39+
useVarintRowCount_{!isTabletRawFormat(version)} {
4140
NIMBLE_CHECK_NOT_NULL(pool_, "Memory pool required for encoding");
4241
init(data);
4342
}
@@ -63,11 +62,12 @@ uint32_t StreamData::decodeStrings(uint32_t count, std::string_view* output) {
6362
return index;
6463
}
6564

66-
void StreamData::reset(std::string_view data, bool encodingEnabled) {
65+
void StreamData::reset(std::string_view data, SerializationVersion version) {
6766
readRows_ = 0;
6867
encoding_.reset();
6968
stringBuffers_.clear();
70-
encodingEnabled_ = encodingEnabled;
69+
encodingEnabled_ = nonLegacyFormat(version);
70+
useVarintRowCount_ = !isTabletRawFormat(version);
7171
// Re-initialize with new data.
7272
init(data);
7373
}
@@ -207,39 +207,37 @@ StreamDataReader::StreamDataReader(
207207
uint32_t StreamDataReader::initialize(std::string_view data) {
208208
pos_ = data.data();
209209
end_ = data.end();
210-
if (options_.hasVersionHeader()) {
211-
const auto version = static_cast<SerializationVersion>(*pos_);
212-
NIMBLE_CHECK(
213-
version == SerializationVersion::kLegacy ||
214-
version == SerializationVersion::kCompact ||
215-
version == SerializationVersion::kCompactRaw ||
216-
version == SerializationVersion::kTabletRaw,
217-
"Unsupported version {}",
218-
static_cast<uint8_t>(version));
219-
// Verify the version read from serialized data matches options.
220-
NIMBLE_CHECK_EQ(
221-
version,
222-
*options_.version,
223-
"Version mismatch: data has version {}, options expect {}",
224-
version,
225-
*options_.version);
226-
++pos_;
227-
}
210+
readVersion();
228211
// All non-legacy formats use varint row count.
229-
if (usesVarintRowCount(options_.serializationVersion())) {
212+
if (usesVarintRowCount(version_)) {
230213
return varint::readVarint32(&pos_);
231214
}
232215
return encoding::readUint32(pos_);
233216
}
234217

218+
void StreamDataReader::readVersion() {
219+
if (!options_.hasHeader) {
220+
version_ = SerializationVersion::kLegacy;
221+
return;
222+
}
223+
version_ = static_cast<SerializationVersion>(*pos_);
224+
NIMBLE_CHECK(
225+
version_ == SerializationVersion::kLegacy ||
226+
version_ == SerializationVersion::kCompact ||
227+
version_ == SerializationVersion::kCompactRaw ||
228+
version_ == SerializationVersion::kTabletRaw,
229+
"Unsupported version {}",
230+
static_cast<uint8_t>(version_));
231+
++pos_;
232+
}
233+
235234
void StreamDataReader::iterateStreams(
236235
const std::function<void(uint32_t offset, std::string_view data)>&
237236
callback) {
238-
if (options_.enableEncoding()) {
237+
if (nonLegacyFormat(version_)) {
239238
// kCompact/kCompactRaw/kTabletRaw format: read stream sizes from trailer.
240-
const auto streamSizes =
241-
detail::readStreamSizes(end_, options_.serializationVersion(), pool_);
242-
const bool tabletRaw = isTabletRawFormat(options_.serializationVersion());
239+
const auto streamSizes = detail::readStreamSizes(end_, version_, pool_);
240+
const bool tabletRaw = isTabletRawFormat(version_);
243241

244242
for (uint32_t i = 0; i < streamSizes.size(); ++i) {
245243
std::string_view streamData(pos_, streamSizes[i]);

0 commit comments

Comments
 (0)