Skip to content

Commit 33752c3

Browse files
xiaoxmeng authored and facebook-github-bot committed
perf: Add bulkGet64WithBaseline and 8-byte fast path for FixedBitWidthEncoding (facebookincubator#641)
Summary: Referenced from MRS AusList decode optimization D98819389 (AusLongListForBitpackEncoder). Ports the key branchless byte-aligned load technique to Nimble's FixedBitWidthEncoding for general use. Add bulk decode optimizations for 64-bit types in FixedBitWidthEncoding, targeting the selective reader and serializer/deserializer materialize() paths. Changes: FixedBitArray: Add bulkGet64WithBaseline() for 64-bit output with arbitrary bitWidth. Three code paths by bit width: - bitWidth <= 32: delegates to the optimized template-unrolled 32-bit path (bulkGetWithBaseline32Into64). - bitWidth 33-57: branchless byte-aligned loads — since the sub-byte offset is at most 7, bitWidth + remainder <= 57 + 7 = 64, so each value fits in a single 64-bit load with no cross-word boundary branch. This eliminates the branch in the hot loop and enables better instruction-level parallelism. - bitWidth > 57: falls back to per-element get() for cross-word handling. FixedBitWidthEncoding: Extend the selective reader fast path (bulkScan + readWithVisitorFast) from 4-byte-only to also support 8-byte integral types (int64/uint64). Previously, 64-bit columns always used the slow per-element path. Legacy FixedBitWidthEncoding: Updated materialize() to use bulkGet64WithBaseline for 8-byte types. Differential Revision: D99154749
1 parent a78abce commit 33752c3

10 files changed

Lines changed: 590 additions & 44 deletions

File tree

dwio/nimble/common/FixedBitArray.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,36 @@ void FixedBitArray::bulkGetWithBaseline32Into64(
356356
*this, buffer_, start, length, values, baseline);
357357
}
358358

359+
void FixedBitArray::bulkGet64WithBaseline(
360+
uint64_t start,
361+
uint64_t length,
362+
uint64_t* values,
363+
uint64_t baseline) const {
364+
if (bitWidth_ <= 32) {
365+
// Delegate to the optimized template-unrolled 32-bit path.
366+
bulkGetWithBaseline32Into64(start, length, values, baseline);
367+
return;
368+
}
369+
if (bitWidth_ <= 57) {
370+
// Branchless byte-aligned loads: since the sub-byte offset is at most 7,
371+
// bitWidth + remainder <= 57 + 7 = 64, so each value fits in a single
372+
// 64-bit load — no cross-word boundary branch needed.
373+
for (uint64_t i = 0; i < length; ++i) {
374+
const uint64_t bits = (start + i) * bitWidth_;
375+
const uint64_t offset = bits >> 3;
376+
const uint64_t remainder = bits & 7;
377+
const uint64_t word =
378+
*reinterpret_cast<const uint64_t*>(buffer_ + offset);
379+
values[i] = ((word >> remainder) & mask_) + baseline;
380+
}
381+
} else {
382+
// Wide bit widths (> 57): need cross-word boundary handling.
383+
for (uint64_t i = 0; i < length; ++i) {
384+
values[i] = get(start + i) + baseline;
385+
}
386+
}
387+
}
388+
359389
template <int bitWidth, int loopPosition, bool withBaseline>
360390
void bulkSet32Loop(
361391
uint64_t** nextWord,

dwio/nimble/common/FixedBitArray.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,17 @@ class FixedBitArray {
9898
uint64_t* values,
9999
uint64_t baseline) const;
100100

101+
// Retrieves a contiguous subarray from slots [start, start + length) into
102+
// 64-bit output values, adding baseline to each. Supports any bit width
103+
// up to 64. For bitWidth <= 32, delegates to the optimized bulkGet32 path.
104+
// For bitWidth 33-57, uses branchless byte-aligned loads (single 64-bit
105+
// load per value). For bitWidth > 57, handles cross-word boundary overflow.
106+
void bulkGet64WithBaseline(
107+
uint64_t start,
108+
uint64_t length,
109+
uint64_t* values,
110+
uint64_t baseline) const;
111+
101112
// Sets a contiguous subarray of slots from [start, start + length).
102113
// Considerably faster than looping/ a get call. Only callable when bitWidth
103114
// <= 32. Same semantics as set -- see the warning there.

dwio/nimble/common/Types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,11 @@ constexpr bool isFourByteIntegralType() {
277277
return std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t>;
278278
}
279279

280+
// Returns true iff T is exactly int64_t or uint64_t (8-byte integral types;
// other 8-byte types such as double do not qualify).
template <typename T>
constexpr bool isEightByteIntegralType() {
  constexpr bool isSigned64 = std::is_same_v<T, int64_t>;
  constexpr bool isUnsigned64 = std::is_same_v<T, uint64_t>;
  return isSigned64 || isUnsigned64;
}
284+
280285
template <typename T>
281286
constexpr bool isSignedIntegralType() {
282287
return std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||

dwio/nimble/common/tests/FixedBitArrayTests.cpp

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
* limitations under the License.
1515
*/
1616
#include <gtest/gtest.h>
17+
#include <cstring>
1718
#include <memory>
1819

1920
#include "dwio/nimble/common/FixedBitArray.h"
21+
#include "fmt/format.h"
2022
#include "folly/Benchmark.h"
2123
#include "folly/Random.h"
2224

@@ -257,6 +259,112 @@ TEST(FixedBitArrayTests, BulkGetWithBaseline32Random) {
257259
}
258260
}
259261

262+
// Randomized round-trip test for bulkGet64WithBaseline: writes random values
// with set(), then verifies bulk reads, single-element reads, and
// offset reads all return value + baseline.
TEST(FixedBitArrayTests, bulkGet64WithBaseline) {
  // Log the seed so failures are reproducible.
  auto seed = folly::Random::rand32();
  LOG(INFO) << "seed: " << seed;
  std::mt19937 rng(seed);

  // Test all three code paths:
  // - bitWidth <= 32: delegates to bulkGetWithBaseline32Into64
  // - bitWidth 33-57: branchless byte-aligned loads
  // - bitWidth > 57: per-element get() fallback
  for (int bitWidth = 1; bitWidth <= 64; ++bitWidth) {
    SCOPED_TRACE(fmt::format("bitWidth={}", bitWidth));
    // Largest value representable in bitWidth bits; the 64-bit case is
    // special-cased because 1ULL << 64 is undefined behavior.
    const uint64_t maxElement = bitWidth == 64
        ? std::numeric_limits<uint64_t>::max()
        : (1ULL << bitWidth) - 1;
    // Pick baseline and value range so value + baseline never overflows
    // bitWidth bits.
    const uint64_t baseline = folly::Random::rand64(rng) % (maxElement / 2 + 1);
    const uint64_t valueRange = maxElement - baseline;

    for (int test = 0; test < kNumTestsPerBitWidth; ++test) {
      const int elementCount = folly::Random::rand32(rng) % kMaxElements;
      auto buffer = std::make_unique<char[]>(
          nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
      // Zero-initialize buffer since set() uses OR semantics.
      std::memset(
          buffer.get(),
          0,
          nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
      nimble::FixedBitArray fixedBitArray(buffer.get(), bitWidth);

      // Generate random values in [0, valueRange]; the full-range case is
      // special-cased to avoid modulo by (valueRange + 1) == 0.
      std::vector<uint64_t> randomValues(elementCount);
      for (int i = 0; i < elementCount; ++i) {
        randomValues[i] = valueRange == std::numeric_limits<uint64_t>::max()
            ? folly::Random::rand64(rng)
            : folly::Random::rand64(rng) % (valueRange + 1);
      }
      for (int i = 0; i < elementCount; ++i) {
        fixedBitArray.set(i, randomValues[i]);
      }

      // Bulk read all elements.
      std::vector<uint64_t> values(elementCount);
      fixedBitArray.bulkGet64WithBaseline(
          0, elementCount, values.data(), baseline);
      for (int i = 0; i < elementCount; ++i) {
        ASSERT_EQ(values[i], randomValues[i] + baseline)
            << "bitWidth: " << bitWidth << ", i: " << i;
      }

      // Single-element reads at each position.
      for (int i = 0; i < elementCount; ++i) {
        uint64_t element;
        fixedBitArray.bulkGet64WithBaseline(i, 1, &element, baseline);
        ASSERT_EQ(element, randomValues[i] + baseline);
      }

      // Read a suffix starting from a random offset.
      if (elementCount > 1) {
        const int offset = folly::Random::rand32(rng) % (elementCount - 1);
        const int count = elementCount - offset;
        std::vector<uint64_t> partial(count);
        fixedBitArray.bulkGet64WithBaseline(
            offset, count, partial.data(), baseline);
        for (int i = 0; i < count; ++i) {
          ASSERT_EQ(partial[i], randomValues[offset + i] + baseline);
        }
      }
    }
  }
}
330+
331+
// Verify bulkGet64WithBaseline with baseline=0 matches per-element get(),
// sampling bit widths from each dispatch path (<=32, 33-57, >57).
TEST(FixedBitArrayTests, bulkGet64WithBaselineZeroBaseline) {
  // Log the seed so failures are reproducible.
  auto seed = folly::Random::rand32();
  LOG(INFO) << "seed: " << seed;
  std::mt19937 rng(seed);

  for (int bitWidth : {1, 8, 16, 32, 40, 48, 56, 60, 64}) {
    SCOPED_TRACE(fmt::format("bitWidth={}", bitWidth));
    const int elementCount = 100 + folly::Random::rand32(rng) % 200;
    // Largest value representable in bitWidth bits; 64 is special-cased
    // because 1ULL << 64 is undefined behavior.
    const uint64_t maxElement = bitWidth == 64
        ? std::numeric_limits<uint64_t>::max()
        : (1ULL << bitWidth) - 1;
    auto buffer = std::make_unique<char[]>(
        nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
    // Zero-initialize buffer since set() uses OR semantics.
    std::memset(
        buffer.get(),
        0,
        nimble::FixedBitArray::bufferSize(elementCount, bitWidth));
    nimble::FixedBitArray fixedBitArray(buffer.get(), bitWidth);

    // Fill with random values that fit in bitWidth bits.
    for (int i = 0; i < elementCount; ++i) {
      const uint64_t value = bitWidth == 64
          ? folly::Random::rand64(rng)
          : folly::Random::rand64(rng) % (maxElement + 1);
      fixedBitArray.set(i, value);
    }

    // Bulk read with a zero baseline must be the identity transform.
    std::vector<uint64_t> bulkValues(elementCount);
    fixedBitArray.bulkGet64WithBaseline(0, elementCount, bulkValues.data(), 0);

    for (int i = 0; i < elementCount; ++i) {
      ASSERT_EQ(bulkValues[i], fixedBitArray.get(i))
          << "bitWidth: " << bitWidth << ", i: " << i;
    }
  }
}
367+
260368
TEST(FixedBitArrayTests, BulkSet32Random) {
261369
auto seed = folly::Random::rand32();
262370
LOG(INFO) << "seed: " << seed;

dwio/nimble/common/tests/TypesTest.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,15 @@ TEST(TypesTest, IsFourByteIntegralType) {
265265
EXPECT_FALSE(isFourByteIntegralType<float>());
266266
}
267267

268+
// isEightByteIntegralType accepts exactly int64_t/uint64_t; narrower
// integrals and 8-byte non-integrals (double) are rejected.
TEST(TypesTest, isEightByteIntegralType) {
  EXPECT_TRUE(isEightByteIntegralType<int64_t>());
  EXPECT_TRUE(isEightByteIntegralType<uint64_t>());
  EXPECT_FALSE(isEightByteIntegralType<int32_t>());
  EXPECT_FALSE(isEightByteIntegralType<uint32_t>());
  EXPECT_FALSE(isEightByteIntegralType<int16_t>());
  EXPECT_FALSE(isEightByteIntegralType<double>());
}
276+
268277
TEST(TypesTest, IsFloatingPointType) {
269278
EXPECT_TRUE(isFloatingPointType<float>());
270279
EXPECT_TRUE(isFloatingPointType<double>());

dwio/nimble/encodings/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
add_subdirectory(benchmarks)
1415
add_subdirectory(tests)
1516
add_subdirectory(legacy)
1617

dwio/nimble/encodings/FixedBitWidthEncoding.h

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -137,17 +137,15 @@ void FixedBitWidthEncoding<T>::materialize(uint32_t rowCount, void* buffer) {
137137
if constexpr (isFourByteIntegralType<physicalType>()) {
138138
fixedBitArray_.bulkGetWithBaseline32(
139139
row_, rowCount, static_cast<uint32_t*>(buffer), baseline_);
140+
} else if constexpr (isEightByteIntegralType<physicalType>()) {
141+
fixedBitArray_.bulkGet64WithBaseline(
142+
row_, rowCount, static_cast<uint64_t*>(buffer), baseline_);
140143
} else {
141-
if (sizeof(physicalType) == 8 && bitWidth_ <= 32) {
142-
fixedBitArray_.bulkGetWithBaseline32Into64(
143-
row_, rowCount, static_cast<uint64_t*>(buffer), baseline_);
144-
} else {
145-
const uint32_t start = row_;
146-
const uint32_t end = start + rowCount;
147-
physicalType* output = static_cast<physicalType*>(buffer);
148-
for (uint32_t i = start; i < end; ++i) {
149-
*output++ = fixedBitArray_.get(i) + baseline_;
150-
}
144+
const uint32_t start = row_;
145+
const uint32_t end = start + rowCount;
146+
physicalType* output = static_cast<physicalType*>(buffer);
147+
for (uint32_t i = start; i < end; ++i) {
148+
*output++ = fixedBitArray_.get(i) + baseline_;
151149
}
152150
}
153151
row_ += rowCount;
@@ -158,10 +156,10 @@ template <typename V>
158156
void FixedBitWidthEncoding<T>::readWithVisitor(
159157
V& visitor,
160158
ReadWithVisitorParams& params) {
161-
// Fast path: use bulk scan for 4-byte integral types with no filter and no
162-
// hook. This is common for dictionary indices (uint32_t).
163-
// The fast path only supports ExtractToReader (not hooks).
164-
// We also check that the output type is compatible:
159+
// Fast path: use bulk scan for integral types with no filter and no hook.
160+
// Supports 4-byte types (common for dictionary indices) and 8-byte types
161+
// (int64/uint64 columns). The fast path only supports ExtractToReader.
162+
// Output type must be compatible:
165163
// - Same type: direct memcpy
166164
// - Widening (larger output type): loop with conversion
167165
using OutputType = detail::ValueType<typename V::DataType>;
@@ -170,7 +168,9 @@ void FixedBitWidthEncoding<T>::readWithVisitor(
170168
constexpr bool kSameType = std::is_same_v<physicalType, OutputType>;
171169
constexpr bool kIsWidening = sizeof(OutputType) > sizeof(physicalType) &&
172170
std::is_integral_v<OutputType> && std::is_integral_v<physicalType>;
173-
constexpr bool kCanUseFastPath = isFourByteIntegralType<physicalType>() &&
171+
constexpr bool kIsFourByte = isFourByteIntegralType<physicalType>();
172+
constexpr bool kIsEightByteIntegral = isEightByteIntegralType<physicalType>();
173+
constexpr bool kCanUseFastPath = (kIsFourByte || kIsEightByteIntegral) &&
174174
!V::kHasFilter && !V::kHasHook && kExtractToReader &&
175175
(kSameType || kIsWidening);
176176
if constexpr (kCanUseFastPath) {
@@ -200,8 +200,9 @@ void FixedBitWidthEncoding<T>::bulkScan(
200200
using DataType = typename V::DataType;
201201
using OutputType = detail::ValueType<DataType>;
202202
static_assert(
203-
isFourByteIntegralType<physicalType>(),
204-
"bulkScan only supports 4-byte integral types");
203+
isFourByteIntegralType<physicalType>() ||
204+
isEightByteIntegralType<physicalType>(),
205+
"bulkScan only supports 4-byte or 8-byte integral types");
205206

206207
if (numSelected == 0) {
207208
return;
@@ -227,24 +228,32 @@ void FixedBitWidthEncoding<T>::bulkScan(
227228
std::is_integral_v<OutputType> && std::is_integral_v<physicalType>;
228229

229230
if constexpr (V::dense) {
230-
// Dense case: values are contiguous, read in bulk.
231-
buffer_.resize(numSelected);
232-
fixedBitArray_.bulkGetWithBaseline32(
233-
selectedRows[0] + offset,
234-
numSelected,
235-
reinterpret_cast<uint32_t*>(buffer_.data()),
236-
baseline_);
237-
238-
if constexpr (kSameSize) {
239-
// Same size types: use fast memcpy (works for same type or
240-
// signed/unsigned variants like int32_t vs uint32_t).
241-
std::memcpy(values, buffer_.data(), numSelected * sizeof(physicalType));
242-
} else if constexpr (kIsUpcast) {
243-
// Widening case: copy with implicit type conversion.
244-
// Compilers typically auto-vectorize this pattern.
245-
for (vector_size_t i = 0; i < numSelected; ++i) {
246-
values[i] = static_cast<OutputType>(buffer_[i]);
231+
if constexpr (isFourByteIntegralType<physicalType>()) {
232+
// 4-byte path: use the optimized template-unrolled bulk decode.
233+
buffer_.resize(numSelected);
234+
fixedBitArray_.bulkGetWithBaseline32(
235+
selectedRows[0] + offset,
236+
numSelected,
237+
reinterpret_cast<uint32_t*>(buffer_.data()),
238+
baseline_);
239+
240+
if constexpr (kSameSize) {
241+
std::memcpy(values, buffer_.data(), numSelected * sizeof(physicalType));
242+
} else if constexpr (kIsUpcast) {
243+
for (vector_size_t i = 0; i < numSelected; ++i) {
244+
values[i] = static_cast<OutputType>(buffer_[i]);
245+
}
247246
}
247+
} else {
248+
// 8-byte path: use bulkGet64WithBaseline which handles all bit widths
249+
// including branchless byte-aligned loads for bitWidth <= 56.
250+
static_assert(isEightByteIntegralType<physicalType>());
251+
static_assert(kSameSize, "8-byte bulkScan requires same-size output");
252+
fixedBitArray_.bulkGet64WithBaseline(
253+
selectedRows[0] + offset,
254+
numSelected,
255+
reinterpret_cast<uint64_t*>(values),
256+
baseline_);
248257
}
249258
} else {
250259
// Sparse case: read individual values at specified positions.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Microbenchmark for FixedBitWidthEncoding bulk-decode paths
# (folly::Benchmark based).
add_executable(nimble_fixed_bit_width_benchmark FixedBitWidthBenchmark.cpp)

target_link_libraries(
  nimble_fixed_bit_width_benchmark
  nimble_encodings_tests_utils
  nimble_common
  Folly::follybenchmark
  Folly::folly
  velox_memory
)

0 commit comments

Comments
 (0)