From d6aab7b50b3de82aa15375a29c3007591c2f2362 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@meta.com>
Date: Thu, 5 Feb 2026 11:25:25 -0800
Subject: [PATCH 1/2] Bound match length

Summary: Fix a problem where we were emitting matches longer than 2^16-1, and then storing them in a `uint16_t`.

Differential Revision: D92425119
---
 contrib/lz-research/BUCK          |  1 +
 contrib/lz-research/codecs/Lz.cpp | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/contrib/lz-research/BUCK b/contrib/lz-research/BUCK
index 02d094bbc..49abae11d 100644
--- a/contrib/lz-research/BUCK
+++ b/contrib/lz-research/BUCK
@@ -4,6 +4,7 @@ load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
 oncall("data_compression")
 
 cpp_library(
+    # @autodeps-skip
     name = "codecs",
     srcs = [
         "codecs/Bucket.cpp",
diff --git a/contrib/lz-research/codecs/Lz.cpp b/contrib/lz-research/codecs/Lz.cpp
index be79ce546..a8911e60d 100644
--- a/contrib/lz-research/codecs/Lz.cpp
+++ b/contrib/lz-research/codecs/Lz.cpp
@@ -203,6 +203,12 @@ size_t constexpr kSmallMatch = 5;
 size_t constexpr kLargeMatch = 8;
 size_t constexpr kNoOpOffset = 16;
 
+inline length_t boundML(uint32_t ml)
+{
+    return length_t(
+            std::min<uint32_t>(ml, std::numeric_limits<length_t>::max()));
+}
+
 inline length_t matchLength(ZS_FastTagTable_Entry entry, uint8_t const* ip)
 {
     if constexpr (ZS_FastTagTable_kMaxMatchLen >= 12) {
@@ -394,7 +400,7 @@ class Transform {
         while (ip < ilimit) {
             auto entry = ZS_FastTagTable_getAndUpdateT(
                     &table, ip, ip - istart, kMinMatch);
-            auto ml             = matchLength(entry, ip);
+            auto ml             = boundML(matchLength(entry, ip));
             auto* match         = istart + entry.pos;
             const auto distance = ip - match;
             if (ml >= kMinMatch && valueFits(distance, sizeof(offset_t))
@@ -402,11 +408,12 @@ class Transform {
                 if (ZL_UNLIKELY(ml >= ZS_FastTagTable_kMaxMatchLen)) {
                     ZL_ASSERT_EQ(
                             memcmp(match, ip, ZS_FastTagTable_kMaxMatchLen), 0);
-                    ml = ZS_FastTagTable_kMaxMatchLen
+                    ml = boundML(
+                            ZS_FastTagTable_kMaxMatchLen
                             + matchLength(
                                     match + ZS_FastTagTable_kMaxMatchLen,
                                     ip + ZS_FastTagTable_kMaxMatchLen,
-                                    iend);
+                                    iend));
                 } else {
                     ZL_ASSERT_EQ(memcmp(match, ip, ml), 0, "ml = %u", ml);
                 }
@@ -545,7 +552,7 @@ class Transform {
                 uint32_t distance  = ip - match;
                 match              = distance >= 65536 ? ip - 1 : match;
                 distance           = distance >= 65536 ? 1 : distance;
-                length_t ml        = matchLength(match, ip, iend);
+                length_t ml        = boundML(matchLength(match, ip, iend));
                 ml                 = ml == 0 ? 1 : ml;
                 const bool isMatch = (ml >= kMinMatch);
 
@@ -703,7 +710,7 @@ class Transform {
             uint32_t distance  = ip - match;
             match              = distance >= 65536 ? ip - 1 : match;
             distance           = distance >= 65536 ? 1 : distance;
-            length_t ml        = matchLength(match, ip, iend);
+            length_t ml        = boundML(matchLength(match, ip, iend));
             ml                 = ml == 0 ? 1 : ml;
             const bool isMatch = (ml >= kMinMatch);
 
@@ -798,6 +805,7 @@ class Transform {
                     --ip;
                     ++ml;
                 }
+                ml        = boundML(ml);
                 size_t ll = ip - anchor;
                 offset_t const offset =
                         kIsIndex ? (match - istart) : (ip - match);
@@ -912,6 +920,7 @@ class Transform {
                     --ip;
                     ++ml;
                 }
+                ml        = boundML(ml);
                 size_t ll = ip - anchor;
                 offset_t const offset =
                         kIsIndex ? (match - istart) : (ip - match);
@@ -1011,6 +1020,7 @@ class Transform {
                     --ip;
                     ++ml;
                 }
+                ml        = boundML(ml);
                 size_t ll = ip - anchor;
                 offset_t const offset =
                         kIsIndex ? (match - istart) : (ip - match);
@@ -1095,6 +1105,7 @@ class Transform {
                 --ptr;
                 ++ml;
             }
+            ml                    = boundML(ml);
             size_t ll             = ptr - anchor;
             offset_t const offset = kIsIndex ? (match - istart) : (ptr - match);
 

From 1178e008098e193a252bcaf8e9c7e77e5edb8216 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@meta.com>
Date: Thu, 5 Feb 2026 11:25:25 -0800
Subject: [PATCH 2/2] Merge crunch optimization

Summary:
Port the optimizations from Crunch in D92286807 back into the codebase.

I ported the Crunch code exactly as-is, and then merged the optimizations back into the codebase as-needed to get the same performance. The only optimization that mattered was loading all the control bytes and LUT entries up front.

Differential Revision: D92425118
---
 contrib/lz-research/codecs/VarByte.cpp | 154 +++++++++++++++----------
 1 file changed, 95 insertions(+), 59 deletions(-)

diff --git a/contrib/lz-research/codecs/VarByte.cpp b/contrib/lz-research/codecs/VarByte.cpp
index 53875786c..ad3c65d39 100644
--- a/contrib/lz-research/codecs/VarByte.cpp
+++ b/contrib/lz-research/codecs/VarByte.cpp
@@ -209,33 +209,86 @@ class VarByteCoder {
         return baseLUT_[control] + value;
     }
 
+    void decode4(
+            uint16_t* __restrict out,
+            std::pair<uint32_t, uint32_t> e0,
+            std::pair<uint32_t, uint32_t> e1,
+            const uint8_t* __restrict bytes,
+            size_t& bitsConsumed) const
+    {
+        const size_t bytesOffset = bitsConsumed / 8;
+        const size_t bitsOffset  = bitsConsumed % 8;
+
+        uint64_t data       = ZL_readLE64(bytes + bytesOffset) >> bitsOffset;
+        const uint64_t mask = uint64_t(e0.second) | (uint64_t(e1.second) << 32);
+        const uint64_t base = uint64_t(e0.first) | (uint64_t(e1.first) << 32);
+
+        const int bitsNeeded = __builtin_popcountll(mask);
+        if (ZL_LIKELY(bitsNeeded <= 56) || bitsOffset == 0) {
+            ZL_writeLE64(out, base + utils::bitDeposit(data, mask));
+        } else {
+            data |= uint64_t(bytes[bytesOffset + 8]) << (64 - bitsOffset);
+            ZL_writeLE64(out, base + utils::bitDeposit(data, mask));
+        }
+        bitsConsumed += bitsNeeded;
+    }
+
     void decode8xU16(
             uint16_t* out,
             const uint8_t* control,
             const uint8_t* bytes,
             size_t& bitsConsumed) const
     {
-        for (size_t i = 0; i < 2; ++i) {
-            const size_t bytesOffset = bitsConsumed / 8;
-            const size_t bitsOffset  = bitsConsumed % 8;
-
-            uint64_t data = ZL_readLE64(bytes + bytesOffset) >> bitsOffset;
-
-            const uint8_t c0    = control[2 * i + 0];
-            const uint8_t c1    = control[2 * i + 1];
-            const uint64_t mask = uint64_t(mask2xLUTU16_[c0])
-                    | (uint64_t(mask2xLUTU16_[c1]) << 32);
-            const uint64_t base = uint64_t(base2xLUTU16_[c0])
-                    | (uint64_t(base2xLUTU16_[c1]) << 32);
-
-            const int bitsNeeded = __builtin_popcountll(mask);
-            if (ZL_LIKELY(bitsNeeded <= 56) || bitsOffset == 0) {
-                ZL_writeLE64(out + 4 * i, base + utils::bitDeposit(data, mask));
+        const auto e0 = combined2xLUTU16_[control[0]];
+        const auto e1 = combined2xLUTU16_[control[1]];
+        const auto e2 = combined2xLUTU16_[control[2]];
+        const auto e3 = combined2xLUTU16_[control[3]];
+        decode4(out, e0, e1, bytes, bitsConsumed);
+        decode4(out + 4, e2, e3, bytes, bitsConsumed);
+    }
+
+    template <typename UInt>
+    void decode8(
+            UInt* __restrict out,
+            size_t outIdx,
+            const uint8_t* __restrict control,
+            const uint8_t* __restrict bytes,
+            size_t& bitsConsumed) const
+
+    {
+        size_t const controlIdx = outIdx / 2;
+        if constexpr (1 && sizeof(UInt) == 2) {
+            decode8xU16(
+                    &out[outIdx], control + controlIdx, bytes, bitsConsumed);
+        } else {
+#if ZL_ARCH_X86_64
+            auto outVec0 = decode4(
+                    control[controlIdx + 0],
+                    control[controlIdx + 1],
+                    _mm_loadu_si128((__m128i_u const*)&bytes[bitsConsumed / 8]),
+                    bitsConsumed);
+            auto outVec1 = decode4(
+                    control[controlIdx + 2],
+                    control[controlIdx + 3],
+                    _mm_loadu_si128((__m128i_u const*)&bytes[bitsConsumed / 8]),
+                    bitsConsumed);
+            if constexpr (sizeof(UInt) == 4) {
+                _mm_storeu_si128((__m128i_u*)&out[outIdx + 0], outVec0);
+                _mm_storeu_si128((__m128i_u*)&out[outIdx + 4], outVec1);
             } else {
-                data |= uint64_t(bytes[bytesOffset + 8]) << (64 - bitsOffset);
-                ZL_writeLE64(out + 4 * i, base + utils::bitDeposit(data, mask));
+                auto const outVec = _mm_packus_epi32(outVec0, outVec1);
+                _mm_storeu_si128((__m128i_u*)&out[outIdx], outVec);
+            }
+#else
+            for (size_t i = 0; i < 8; ++i) {
+                uint8_t const ctrl =
+                        (control[outIdx / 2] >> (i % 2 == 0 ? 0 : 4)) % 16;
+                out[outIdx + i] =
+                        decode1(ctrl,
+                                safeRead(bytes.subspan(bitsConsumed / 8)),
+                                bitsConsumed);
             }
-            bitsConsumed += bitsNeeded;
+#endif
         }
     }
 
@@ -258,54 +311,21 @@ class VarByteCoder {
         size_t outIdx           = 0;
         size_t constexpr kLimit = 16 + (sizeof(UInt) == 2 ? 8 : 16);
         if (bytes.size() >= kLimit) {
+            // Unrolling this loop to 16x makes it ~1% faster.
+            // Consider doing that when productionizing.
             size_t const bitsLimit = 8 * (bytes.size() - kLimit);
             size_t const outLimit  = out.size() - 8;
             while (bitsConsumed < bitsLimit && outIdx < outLimit) {
-                size_t const controlIdx = outIdx / 2;
                 // fprintf(
                 //     stderr,
                 //     "bitsConsumed = %zu | limit = %zu | outIdx = %zu |
                 //     ctrlIdx = %zu\n", bitsConsumed, bitsLimit, outIdx,
                 //     controlIdx);
-                if constexpr (1 && sizeof(UInt) == 2) {
-                    decode8xU16(
-                            &out[outIdx],
-                            control.data() + controlIdx,
-                            bytes.data(),
-                            bitsConsumed);
-                } else {
-#if ZL_ARCH_X86_64
-                    auto outVec0 = decode4(
-                            control[controlIdx + 0],
-                            control[controlIdx + 1],
-                            _mm_loadu_si128(
-                                    (__m128i_u const*)&bytes[bitsConsumed / 8]),
-                            bitsConsumed);
-                    auto outVec1 = decode4(
-                            control[controlIdx + 2],
-                            control[controlIdx + 3],
-                            _mm_loadu_si128(
-                                    (__m128i_u const*)&bytes[bitsConsumed / 8]),
-                            bitsConsumed);
-                    if constexpr (sizeof(UInt) == 4) {
-                        _mm_storeu_si128((__m128i_u*)&out[outIdx + 0], outVec0);
-                        _mm_storeu_si128((__m128i_u*)&out[outIdx + 4], outVec1);
-                    } else {
-                        auto const outVec = _mm_packus_epi32(outVec0, outVec1);
-                        _mm_storeu_si128((__m128i_u*)&out[outIdx], outVec);
-                    }
-#else
-                    for (size_t i = 0; i < 8; ++i) {
-                        uint8_t const ctrl =
-                                (control[outIdx / 2] >> (i % 2 == 0 ? 0 : 4))
-                                % 16;
-                        out[outIdx + i] = decode1(
-                                ctrl,
-                                safeRead(bytes.subspan(bitsConsumed / 8)),
-                                bitsConsumed);
-                    }
-#endif
-                }
+                decode8(out.data(),
+                        outIdx,
+                        control.data(),
+                        bytes.data(),
+                        bitsConsumed);
                 outIdx += 8;
             }
         }
@@ -628,6 +648,22 @@ class VarByteCoder {
             makeBaseLUT()) };
     static constexpr std::array<uint32_t, 256> mask2xLUTU16_{ expandLUTU16(
             makeMaskLUT()) };
+
+    template <size_t N>
+    static constexpr std::array<std::pair<uint32_t, uint32_t>, N> combineLUT(
+            const std::array<uint32_t, N>& a,
+            const std::array<uint32_t, N>& b)
+    {
+        std::array<std::pair<uint32_t, uint32_t>, N> out;
+        for (size_t i = 0; i < N; ++i) {
+            out[i] = { a[i], b[i] };
+        }
+        return out;
+    }
+
+    static constexpr auto combined2xLUTU16_{
+        combineLUT(base2xLUTU16_, mask2xLUTU16_)
+    };
 };
 
 using VarByteCoder16 =