Skip to content

Commit 33752c3

Browse files
xiaoxmeng authored and facebook-github-bot committed
perf: Add bulkGet64WithBaseline and 8-byte fast path for FixedBitWidthEncoding (facebookincubator#641)
Summary: Referenced from MRS AusList decode optimization D98819389 (AusLongListForBitpackEncoder). Ports the key branchless byte-aligned load technique to Nimble's FixedBitWidthEncoding for general use. Add bulk decode optimizations for 64-bit types in FixedBitWidthEncoding, targeting the selective reader and serializer/deserializer materialize() paths. Changes: FixedBitArray: Add bulkGet64WithBaseline() for 64-bit output with arbitrary bitWidth. Three code paths by bit width: - bitWidth <= 32: delegates to the optimized template-unrolled 32-bit path (bulkGetWithBaseline32Into64). - bitWidth 33-57: branchless byte-aligned loads — since the sub-byte offset is at most 7, bitWidth + remainder <= 57 + 7 = 64, so each value fits in a single 64-bit load with no cross-word boundary branch. This eliminates the branch in the hot loop and enables better instruction-level parallelism. - bitWidth > 57: falls back to per-element get() for cross-word handling. FixedBitWidthEncoding: Extend the selective reader fast path (bulkScan + readWithVisitorFast) from 4-byte-only to also support 8-byte integral types (int64/uint64). Previously, 64-bit columns always used the slow per-element path. Legacy FixedBitWidthEncoding: Updated materialize() to use bulkGet64WithBaseline for 8-byte types. Differential Revision: D99154749
1 parent a78abce commit 33752c3

10 files changed

Lines changed: 590 additions & 44 deletions

File tree

dwio/nimble/common/FixedBitArray.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,36 @@ void FixedBitArray::bulkGetWithBaseline32Into64(
356356
*this, buffer_, start, length, values, baseline);
357357
}
358358

359+
void FixedBitArray::bulkGet64WithBaseline(
360+
uint64_t start,
361+
uint64_t length,
362+
uint64_t* values,
363+
uint64_t baseline) const {
364+
if (bitWidth_ <= 32) {
365+
// Delegate to the optimized template-unrolled 32-bit path.
366+
bulkGetWithBaseline32Into64(start, length, values, baseline);
367+
return;
368+
}
369+
if (bitWidth_ <= 57) {
370+
// Branchless byte-aligned loads: since the sub-byte offset is at most 7,
371+
// bitWidth + remainder <= 57 + 7 = 64, so each value fits in a single
372+
// 64-bit load — no cross-word boundary branch needed.
373+
for (uint64_t i = 0; i < length; ++i) {
374+
const uint64_t bits = (start + i) * bitWidth_;
375+
const uint64_t offset = bits >> 3;
376+
const uint64_t remainder = bits & 7;
377+
const uint64_t word =
378+
*reinterpret_cast<const uint64_t*>(buffer_ + offset);
379+
values[i] = ((word >> remainder) & mask_) + baseline;
380+
}
381+
} else {
382+
// Wide bit widths (> 57): need cross-word boundary handling.
383+
for (uint64_t i = 0; i < length; ++i) {
384+
values[i] = get(start + i) + baseline;
385+
}
386+
}
387+
}
388+
359389
template <int bitWidth, int loopPosition, bool withBaseline>
360390
void bulkSet32Loop(
361391
uint64_t** nextWord,

dwio/nimble/common/FixedBitArray.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,17 @@ class FixedBitArray {
9898
uint64_t* values,
9999
uint64_t baseline) const;
100100

101+
// Retrieves a contiguous subarray from slots [start, start + length) into
102+
// 64-bit output values, adding baseline to each. Supports any bit width
103+
// up to 64. For bitWidth <= 32, delegates to the optimized bulkGet32 path.
104+
// For bitWidth 33-57, uses branchless byte-aligned loads (single 64-bit
105+
// load per value). For bitWidth > 57, handles cross-word boundary overflow.
106+
void bulkGet64WithBaseline(
107+
uint64_t start,
108+
uint64_t length,
109+
uint64_t* values,
110+
uint64_t baseline) const;
111+
101112
// Sets a contiguous subarray of slots from [start, start + length).
102113
// Considerably faster than looping/ a get call. Only callable when bitWidth
103114
// <= 32. Same semantics as set -- see the warning there.

dwio/nimble/common/Types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,11 @@ constexpr bool isFourByteIntegralType() {
277277
return std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t>;
278278
}
279279

280+
// Returns true iff T is exactly int64_t or uint64_t (8-byte integral types;
// other 8-byte types such as double do not qualify).
template <typename T>
constexpr bool isEightByteIntegralType() {
  constexpr bool isSigned64 = std::is_same_v<T, int64_t>;
  constexpr bool isUnsigned64 = std::is_same_v<T, uint64_t>;
  return isSigned64 || isUnsigned64;
}
284+
280285
template <typename T>
281286
constexpr bool isSignedIntegralType() {
282287
return std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||

dwio/nimble/common/tests/FixedBitArrayTests.cpp

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
* limitations under the License.
1515
*/
1616
#include <gtest/gtest.h>
17+
#include <cstring>
1718
#include <memory>
1819

1920
#include "dwio/nimble/common/FixedBitArray.h"
21+
#include "fmt/format.h"
2022
#include "folly/Benchmark.h"
2123
#include "folly/Random.h"
2224

@@ -257,6 +259,112 @@ TEST(FixedBitArrayTests, BulkGetWithBaseline32Random) {
257259
}
258260
}
259261

262+
// Randomized round-trip test for bulkGet64WithBaseline: writes random values
// with set(), then verifies bulk reads, single-element reads, and
// offset reads all return value + baseline.
TEST(FixedBitArrayTests, bulkGet64WithBaseline) {
  // Log the seed so failures are reproducible.
  auto seed = folly::Random::rand32();
  LOG(INFO) << "seed: " << seed;
  std::mt19937 rng(seed);

  // Test all three code paths:
  // - bitWidth <= 32: delegates to bulkGetWithBaseline32Into64
  // - bitWidth 33-57: branchless byte-aligned loads
  // - bitWidth > 57: per-element get() fallback
  for (int bitWidth = 1; bitWidth <= 64; ++bitWidth) {
    SCOPED_TRACE(fmt::format("bitWidth={}", bitWidth));
    // Largest value representable in bitWidth bits; the 64-bit case is
    // special-cased because 1ULL << 64 is undefined behavior.
    const uint64_t maxElement = bitWidth == 64
        ? std::numeric_limits<uint64_t>::max()
        : (1ULL << bitWidth) - 1;
    // Pick baseline and value range so value + baseline never overflows
    // bitWidth bits.
    const uint64_t baseline = folly::Random::rand64(rng) % (maxElement / 2 + 1);
    const uint64_t valueRange = maxElement - baseline;

    for (int test = 0; test < kNumTestsPerBitWidth; ++test) {
      const int elementCount = folly::Random::rand32(rng) % kMaxElements;
      auto buffer = std::make_unique<char[]>(
          nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
      // Zero-initialize buffer since set() uses OR semantics.
      std::memset(
          buffer.get(),
          0,
          nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
      nimble::FixedBitArray fixedBitArray(buffer.get(), bitWidth);

      // Generate random values in [0, valueRange]; the full-range case is
      // special-cased to avoid modulo by (valueRange + 1) == 0.
      std::vector<uint64_t> randomValues(elementCount);
      for (int i = 0; i < elementCount; ++i) {
        randomValues[i] = valueRange == std::numeric_limits<uint64_t>::max()
            ? folly::Random::rand64(rng)
            : folly::Random::rand64(rng) % (valueRange + 1);
      }
      for (int i = 0; i < elementCount; ++i) {
        fixedBitArray.set(i, randomValues[i]);
      }

      // Bulk read all elements.
      std::vector<uint64_t> values(elementCount);
      fixedBitArray.bulkGet64WithBaseline(
          0, elementCount, values.data(), baseline);
      for (int i = 0; i < elementCount; ++i) {
        ASSERT_EQ(values[i], randomValues[i] + baseline)
            << "bitWidth: " << bitWidth << ", i: " << i;
      }

      // Single-element reads at each position.
      for (int i = 0; i < elementCount; ++i) {
        uint64_t element;
        fixedBitArray.bulkGet64WithBaseline(i, 1, &element, baseline);
        ASSERT_EQ(element, randomValues[i] + baseline);
      }

      // Read a suffix starting from a random offset.
      if (elementCount > 1) {
        const int offset = folly::Random::rand32(rng) % (elementCount - 1);
        const int count = elementCount - offset;
        std::vector<uint64_t> partial(count);
        fixedBitArray.bulkGet64WithBaseline(
            offset, count, partial.data(), baseline);
        for (int i = 0; i < count; ++i) {
          ASSERT_EQ(partial[i], randomValues[offset + i] + baseline);
        }
      }
    }
  }
}
330+
331+
// Verify bulkGet64WithBaseline with baseline=0 matches per-element get(),
// sampling bit widths from each dispatch path (<=32, 33-57, >57).
TEST(FixedBitArrayTests, bulkGet64WithBaselineZeroBaseline) {
  // Log the seed so failures are reproducible.
  auto seed = folly::Random::rand32();
  LOG(INFO) << "seed: " << seed;
  std::mt19937 rng(seed);

  for (int bitWidth : {1, 8, 16, 32, 40, 48, 56, 60, 64}) {
    SCOPED_TRACE(fmt::format("bitWidth={}", bitWidth));
    const int elementCount = 100 + folly::Random::rand32(rng) % 200;
    // Largest value representable in bitWidth bits; 64 is special-cased
    // because 1ULL << 64 is undefined behavior.
    const uint64_t maxElement = bitWidth == 64
        ? std::numeric_limits<uint64_t>::max()
        : (1ULL << bitWidth) - 1;
    auto buffer = std::make_unique<char[]>(
        nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
    // Zero-initialize buffer since set() uses OR semantics.
    std::memset(
        buffer.get(),
        0,
        nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
    nimble::FixedBitArray fixedBitArray(buffer.get(), bitWidth);

    // Fill with random values that fit in bitWidth bits.
    for (int i = 0; i < elementCount; ++i) {
      const uint64_t value = bitWidth == 64
          ? folly::Random::rand64(rng)
          : folly::Random::rand64(rng) % (maxElement + 1);
      fixedBitArray.set(i, value);
    }

    // Bulk read with a zero baseline must be the identity transform.
    std::vector<uint64_t> bulkValues(elementCount);
    fixedBitArray.bulkGet64WithBaseline(0, elementCount, bulkValues.data(), 0);

    for (int i = 0; i < elementCount; ++i) {
      ASSERT_EQ(bulkValues[i], fixedBitArray.get(i))
          << "bitWidth: " << bitWidth << ", i: " << i;
    }
  }
}
367+
260368
TEST(FixedBitArrayTests, BulkSet32Random) {
261369
auto seed = folly::Random::rand32();
262370
LOG(INFO) << "seed: " << seed;

dwio/nimble/common/tests/TypesTest.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,15 @@ TEST(TypesTest, IsFourByteIntegralType) {
265265
EXPECT_FALSE(isFourByteIntegralType<float>());
266266
}
267267

268+
// isEightByteIntegralType accepts exactly int64_t/uint64_t; narrower
// integrals and 8-byte non-integrals (double) are rejected.
TEST(TypesTest, isEightByteIntegralType) {
  EXPECT_TRUE(isEightByteIntegralType<int64_t>());
  EXPECT_TRUE(isEightByteIntegralType<uint64_t>());
  EXPECT_FALSE(isEightByteIntegralType<int32_t>());
  EXPECT_FALSE(isEightByteIntegralType<uint32_t>());
  EXPECT_FALSE(isEightByteIntegralType<int16_t>());
  EXPECT_FALSE(isEightByteIntegralType<double>());
}
276+
268277
TEST(TypesTest, IsFloatingPointType) {
269278
EXPECT_TRUE(isFloatingPointType<float>());
270279
EXPECT_TRUE(isFloatingPointType<double>());

dwio/nimble/encodings/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
add_subdirectory(benchmarks)
1415
add_subdirectory(tests)
1516
add_subdirectory(legacy)
1617

dwio/nimble/encodings/FixedBitWidthEncoding.h

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -137,17 +137,15 @@ void FixedBitWidthEncoding<T>::materialize(uint32_t rowCount, void* buffer) {
137137
if constexpr (isFourByteIntegralType<physicalType>()) {
138138
fixedBitArray_.bulkGetWithBaseline32(
139139
row_, rowCount, static_cast<uint32_t*>(buffer), baseline_);
140+
} else if constexpr (isEightByteIntegralType<physicalType>()) {
141+
fixedBitArray_.bulkGet64WithBaseline(
142+
row_, rowCount, static_cast<uint64_t*>(buffer), baseline_);
140143
} else {
141-
if (sizeof(physicalType) == 8 && bitWidth_ <= 32) {
142-
fixedBitArray_.bulkGetWithBaseline32Into64(
143-
row_, rowCount, static_cast<uint64_t*>(buffer), baseline_);
144-
} else {
145-
const uint32_t start = row_;
146-
const uint32_t end = start + rowCount;
147-
physicalType* output = static_cast<physicalType*>(buffer);
148-
for (uint32_t i = start; i < end; ++i) {
149-
*output++ = fixedBitArray_.get(i) + baseline_;
150-
}
144+
const uint32_t start = row_;
145+
const uint32_t end = start + rowCount;
146+
physicalType* output = static_cast<physicalType*>(buffer);
147+
for (uint32_t i = start; i < end; ++i) {
148+
*output++ = fixedBitArray_.get(i) + baseline_;
151149
}
152150
}
153151
row_ += rowCount;
@@ -158,10 +156,10 @@ template <typename V>
158156
void FixedBitWidthEncoding<T>::readWithVisitor(
159157
V& visitor,
160158
ReadWithVisitorParams& params) {
161-
// Fast path: use bulk scan for 4-byte integral types with no filter and no
162-
// hook. This is common for dictionary indices (uint32_t).
163-
// The fast path only supports ExtractToReader (not hooks).
164-
// We also check that the output type is compatible:
159+
// Fast path: use bulk scan for integral types with no filter and no hook.
160+
// Supports 4-byte types (common for dictionary indices) and 8-byte types
161+
// (int64/uint64 columns). The fast path only supports ExtractToReader.
162+
// Output type must be compatible:
165163
// - Same type: direct memcpy
166164
// - Widening (larger output type): loop with conversion
167165
using OutputType = detail::ValueType<typename V::DataType>;
@@ -170,7 +168,9 @@ void FixedBitWidthEncoding<T>::readWithVisitor(
170168
constexpr bool kSameType = std::is_same_v<physicalType, OutputType>;
171169
constexpr bool kIsWidening = sizeof(OutputType) > sizeof(physicalType) &&
172170
std::is_integral_v<OutputType> && std::is_integral_v<physicalType>;
173-
constexpr bool kCanUseFastPath = isFourByteIntegralType<physicalType>() &&
171+
constexpr bool kIsFourByte = isFourByteIntegralType<physicalType>();
172+
constexpr bool kIsEightByteIntegral = isEightByteIntegralType<physicalType>();
173+
constexpr bool kCanUseFastPath = (kIsFourByte || kIsEightByteIntegral) &&
174174
!V::kHasFilter && !V::kHasHook && kExtractToReader &&
175175
(kSameType || kIsWidening);
176176
if constexpr (kCanUseFastPath) {
@@ -200,8 +200,9 @@ void FixedBitWidthEncoding<T>::bulkScan(
200200
using DataType = typename V::DataType;
201201
using OutputType = detail::ValueType<DataType>;
202202
static_assert(
203-
isFourByteIntegralType<physicalType>(),
204-
"bulkScan only supports 4-byte integral types");
203+
isFourByteIntegralType<physicalType>() ||
204+
isEightByteIntegralType<physicalType>(),
205+
"bulkScan only supports 4-byte or 8-byte integral types");
205206

206207
if (numSelected == 0) {
207208
return;
@@ -227,24 +228,32 @@ void FixedBitWidthEncoding<T>::bulkScan(
227228
std::is_integral_v<OutputType> && std::is_integral_v<physicalType>;
228229

229230
if constexpr (V::dense) {
230-
// Dense case: values are contiguous, read in bulk.
231-
buffer_.resize(numSelected);
232-
fixedBitArray_.bulkGetWithBaseline32(
233-
selectedRows[0] + offset,
234-
numSelected,
235-
reinterpret_cast<uint32_t*>(buffer_.data()),
236-
baseline_);
237-
238-
if constexpr (kSameSize) {
239-
// Same size types: use fast memcpy (works for same type or
240-
// signed/unsigned variants like int32_t vs uint32_t).
241-
std::memcpy(values, buffer_.data(), numSelected * sizeof(physicalType));
242-
} else if constexpr (kIsUpcast) {
243-
// Widening case: copy with implicit type conversion.
244-
// Compilers typically auto-vectorize this pattern.
245-
for (vector_size_t i = 0; i < numSelected; ++i) {
246-
values[i] = static_cast<OutputType>(buffer_[i]);
231+
if constexpr (isFourByteIntegralType<physicalType>()) {
232+
// 4-byte path: use the optimized template-unrolled bulk decode.
233+
buffer_.resize(numSelected);
234+
fixedBitArray_.bulkGetWithBaseline32(
235+
selectedRows[0] + offset,
236+
numSelected,
237+
reinterpret_cast<uint32_t*>(buffer_.data()),
238+
baseline_);
239+
240+
if constexpr (kSameSize) {
241+
std::memcpy(values, buffer_.data(), numSelected * sizeof(physicalType));
242+
} else if constexpr (kIsUpcast) {
243+
for (vector_size_t i = 0; i < numSelected; ++i) {
244+
values[i] = static_cast<OutputType>(buffer_[i]);
245+
}
247246
}
247+
} else {
248+
// 8-byte path: use bulkGet64WithBaseline which handles all bit widths
249+
// including branchless byte-aligned loads for bitWidth <= 56.
250+
static_assert(isEightByteIntegralType<physicalType>());
251+
static_assert(kSameSize, "8-byte bulkScan requires same-size output");
252+
fixedBitArray_.bulkGet64WithBaseline(
253+
selectedRows[0] + offset,
254+
numSelected,
255+
reinterpret_cast<uint64_t*>(values),
256+
baseline_);
248257
}
249258
} else {
250259
// Sparse case: read individual values at specified positions.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Microbenchmark for FixedBitWidthEncoding bulk-decode paths
# (folly::Benchmark based).
add_executable(nimble_fixed_bit_width_benchmark FixedBitWidthBenchmark.cpp)

target_link_libraries(
  nimble_fixed_bit_width_benchmark
  nimble_encodings_tests_utils
  nimble_common
  Folly::follybenchmark
  Folly::folly
  velox_memory
)

0 commit comments

Comments
 (0)