From d6aab7b50b3de82aa15375a29c3007591c2f2362 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 5 Feb 2026 11:25:25 -0800 Subject: [PATCH 1/2] Bound match length Summary: Fix a problem where we were emitting matches longer than 2^16-1, and then storing them in a `uint16_t`. Differential Revision: D92425119 --- contrib/lz-research/BUCK | 1 + contrib/lz-research/codecs/Lz.cpp | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/contrib/lz-research/BUCK b/contrib/lz-research/BUCK index 02d094bbc..49abae11d 100644 --- a/contrib/lz-research/BUCK +++ b/contrib/lz-research/BUCK @@ -4,6 +4,7 @@ load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") oncall("data_compression") cpp_library( + # @autodeps-skip name = "codecs", srcs = [ "codecs/Bucket.cpp", diff --git a/contrib/lz-research/codecs/Lz.cpp b/contrib/lz-research/codecs/Lz.cpp index be79ce546..a8911e60d 100644 --- a/contrib/lz-research/codecs/Lz.cpp +++ b/contrib/lz-research/codecs/Lz.cpp @@ -203,6 +203,12 @@ size_t constexpr kSmallMatch = 5; size_t constexpr kLargeMatch = 8; size_t constexpr kNoOpOffset = 16; +inline length_t boundML(uint32_t ml) +{ + return length_t( + std::min(ml, std::numeric_limits::max())); +} + inline length_t matchLength(ZS_FastTagTable_Entry entry, uint8_t const* ip) { if constexpr (ZS_FastTagTable_kMaxMatchLen >= 12) { @@ -394,7 +400,7 @@ class Transform { while (ip < ilimit) { auto entry = ZS_FastTagTable_getAndUpdateT( &table, ip, ip - istart, kMinMatch); - auto ml = matchLength(entry, ip); + auto ml = boundML(matchLength(entry, ip)); auto* match = istart + entry.pos; const auto distance = ip - match; if (ml >= kMinMatch && valueFits(distance, sizeof(offset_t)) @@ -402,11 +408,12 @@ class Transform { if (ZL_UNLIKELY(ml >= ZS_FastTagTable_kMaxMatchLen)) { ZL_ASSERT_EQ( memcmp(match, ip, ZS_FastTagTable_kMaxMatchLen), 0); - ml = ZS_FastTagTable_kMaxMatchLen + ml = boundML( + ZS_FastTagTable_kMaxMatchLen + matchLength( match + ZS_FastTagTable_kMaxMatchLen, ip + ZS_FastTagTable_kMaxMatchLen, - iend); + iend)); } else { ZL_ASSERT_EQ(memcmp(match, ip, ml), 0, "ml = %u", ml); } @@ -545,7 +552,7 @@ class Transform { uint32_t distance = ip - match; match = distance >= 65536 ? ip - 1 : match; distance = distance >= 65536 ? 1 : distance; - length_t ml = matchLength(match, ip, iend); + length_t ml = boundML(matchLength(match, ip, iend)); ml = ml == 0 ? 1 : ml; const bool isMatch = (ml >= kMinMatch); @@ -703,7 +710,7 @@ class Transform { uint32_t distance = ip - match; match = distance >= 65536 ? ip - 1 : match; distance = distance >= 65536 ? 1 : distance; - length_t ml = matchLength(match, ip, iend); + length_t ml = boundML(matchLength(match, ip, iend)); ml = ml == 0 ? 1 : ml; const bool isMatch = (ml >= kMinMatch); @@ -798,6 +805,7 @@ class Transform { --ip; ++ml; } + ml = boundML(ml); size_t ll = ip - anchor; offset_t const offset = kIsIndex ? (match - istart) : (ip - match); @@ -912,6 +920,7 @@ class Transform { --ip; ++ml; } + ml = boundML(ml); size_t ll = ip - anchor; offset_t const offset = kIsIndex ? (match - istart) : (ip - match); @@ -1011,6 +1020,7 @@ class Transform { --ip; ++ml; } + ml = boundML(ml); size_t ll = ip - anchor; offset_t const offset = kIsIndex ? (match - istart) : (ip - match); @@ -1095,6 +1105,7 @@ class Transform { --ptr; ++ml; } + ml = boundML(ml); size_t ll = ptr - anchor; offset_t const offset = kIsIndex ? (match - istart) : (ptr - match); From 1178e008098e193a252bcaf8e9c7e77e5edb8216 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 5 Feb 2026 11:25:25 -0800 Subject: [PATCH 2/2] Merge crunch optimization Summary: Port the optimizations from Crunch in D92286807 back into the codebase. I ported the Crunch code exactly as-is, and then merged the optimizations back into the codebase as-needed to get the same performance. The only optimization that mattered was loading all the control bytes and LUT entries up front. Differential Revision: D92425118 --- contrib/lz-research/codecs/VarByte.cpp | 154 +++++++++++++++---------- 1 file changed, 95 insertions(+), 59 deletions(-) diff --git a/contrib/lz-research/codecs/VarByte.cpp b/contrib/lz-research/codecs/VarByte.cpp index 53875786c..ad3c65d39 100644 --- a/contrib/lz-research/codecs/VarByte.cpp +++ b/contrib/lz-research/codecs/VarByte.cpp @@ -209,33 +209,86 @@ class VarByteCoder { return baseLUT_[control] + value; } + void decode4( + uint16_t* __restrict out, + std::pair e0, + std::pair e1, + const uint8_t* __restrict bytes, + size_t& bitsConsumed) const + { + const size_t bytesOffset = bitsConsumed / 8; + const size_t bitsOffset = bitsConsumed % 8; + + uint64_t data = ZL_readLE64(bytes + bytesOffset) >> bitsOffset; + const uint64_t mask = uint64_t(e0.second) | (uint64_t(e1.second) << 32); + const uint64_t base = uint64_t(e0.first) | (uint64_t(e1.first) << 32); + + const int bitsNeeded = __builtin_popcountll(mask); + if (ZL_LIKELY(bitsNeeded <= 56) || bitsOffset == 0) { + ZL_writeLE64(out, base + utils::bitDeposit(data, mask)); + } else { + data |= uint64_t(bytes[bytesOffset + 8]) << (64 - bitsOffset); + ZL_writeLE64(out, base + utils::bitDeposit(data, mask)); + } + bitsConsumed += bitsNeeded; + } + void decode8xU16( uint16_t* out, const uint8_t* control, const uint8_t* bytes, size_t& bitsConsumed) const { - for (size_t i = 0; i < 2; ++i) { - const size_t bytesOffset = bitsConsumed / 8; - const size_t bitsOffset = bitsConsumed % 8; - - uint64_t data = ZL_readLE64(bytes + bytesOffset) >> bitsOffset; - - const uint8_t c0 = control[2 * i + 0]; - const uint8_t c1 = control[2 * i + 1]; - const uint64_t mask = uint64_t(mask2xLUTU16_[c0]) - | (uint64_t(mask2xLUTU16_[c1]) << 32); - const uint64_t base = uint64_t(base2xLUTU16_[c0]) - | (uint64_t(base2xLUTU16_[c1]) << 32); - - const int bitsNeeded = __builtin_popcountll(mask); - if (ZL_LIKELY(bitsNeeded <= 56) || bitsOffset == 0) { - ZL_writeLE64(out + 4 * i, base + utils::bitDeposit(data, mask)); + const auto e0 = combined2xLUTU16_[control[0]]; + const auto e1 = combined2xLUTU16_[control[1]]; + const auto e2 = combined2xLUTU16_[control[2]]; + const auto e3 = combined2xLUTU16_[control[3]]; + decode4(out, e0, e1, bytes, bitsConsumed); + decode4(out + 4, e2, e3, bytes, bitsConsumed); + } + + template + void decode8( + UInt* __restrict out, + size_t outIdx, + const uint8_t* __restrict control, + const uint8_t* __restrict bytes, + size_t& bitsConsumed) const + + { + size_t const controlIdx = outIdx / 2; + if constexpr (1 && sizeof(UInt) == 2) { + decode8xU16( + &out[outIdx], control + controlIdx, bytes, bitsConsumed); + } else { +#if ZL_ARCH_X86_64 + auto outVec0 = decode4( + control[controlIdx + 0], + control[controlIdx + 1], + _mm_loadu_si128((__m128i_u const*)&bytes[bitsConsumed / 8]), + bitsConsumed); + auto outVec1 = decode4( + control[controlIdx + 2], + control[controlIdx + 3], + _mm_loadu_si128((__m128i_u const*)&bytes[bitsConsumed / 8]), + bitsConsumed); + if constexpr (sizeof(UInt) == 4) { + _mm_storeu_si128((__m128i_u*)&out[outIdx + 0], outVec0); + _mm_storeu_si128((__m128i_u*)&out[outIdx + 4], outVec1); } else { - data |= uint64_t(bytes[bytesOffset + 8]) << (64 - bitsOffset); - ZL_writeLE64(out + 4 * i, base + utils::bitDeposit(data, mask)); + auto const outVec = _mm_packus_epi32(outVec0, outVec1); + _mm_storeu_si128((__m128i_u*)&out[outIdx], outVec); + } +#else + for (size_t i = 0; i < 8; ++i) { + uint8_t const ctrl = + (control[outIdx / 2] >> (i % 2 == 0 ? 0 : 4)) % 16; + out[outIdx + i] = + decode1(ctrl, + safeRead(bytes.subspan(bitsConsumed / 8)), + bitsConsumed); } - bitsConsumed += bitsNeeded; +#endif } } @@ -258,54 +311,21 @@ class VarByteCoder { size_t outIdx = 0; size_t constexpr kLimit = 16 + (sizeof(UInt) == 2 ? 8 : 16); if (bytes.size() >= kLimit) { + // Unrolling this loop to 16x makes it ~1% faster. + // Consider doing that when productionizing. size_t const bitsLimit = 8 * (bytes.size() - kLimit); size_t const outLimit = out.size() - 8; while (bitsConsumed < bitsLimit && outIdx < outLimit) { - size_t const controlIdx = outIdx / 2; // fprintf( // stderr, // "bitsConsumed = %zu | limit = %zu | outIdx = %zu | // ctrlIdx = %zu\n", bitsConsumed, bitsLimit, outIdx, // controlIdx); - if constexpr (1 && sizeof(UInt) == 2) { - decode8xU16( - &out[outIdx], - control.data() + controlIdx, - bytes.data(), - bitsConsumed); - } else { -#if ZL_ARCH_X86_64 - auto outVec0 = decode4( - control[controlIdx + 0], - control[controlIdx + 1], - _mm_loadu_si128( - (__m128i_u const*)&bytes[bitsConsumed / 8]), - bitsConsumed); - auto outVec1 = decode4( - control[controlIdx + 2], - control[controlIdx + 3], - _mm_loadu_si128( - (__m128i_u const*)&bytes[bitsConsumed / 8]), - bitsConsumed); - if constexpr (sizeof(UInt) == 4) { - _mm_storeu_si128((__m128i_u*)&out[outIdx + 0], outVec0); - _mm_storeu_si128((__m128i_u*)&out[outIdx + 4], outVec1); - } else { - auto const outVec = _mm_packus_epi32(outVec0, outVec1); - _mm_storeu_si128((__m128i_u*)&out[outIdx], outVec); - } -#else - for (size_t i = 0; i < 8; ++i) { - uint8_t const ctrl = - (control[outIdx / 2] >> (i % 2 == 0 ? 0 : 4)) - % 16; - out[outIdx + i] = decode1( - ctrl, - safeRead(bytes.subspan(bitsConsumed / 8)), - bitsConsumed); - } -#endif - } + decode8(out.data(), + outIdx, + control.data(), + bytes.data(), + bitsConsumed); outIdx += 8; } } @@ -628,6 +648,22 @@ class VarByteCoder { makeBaseLUT()) }; static constexpr std::array mask2xLUTU16_{ expandLUTU16( makeMaskLUT()) }; + + template + static constexpr std::array, N> combineLUT( + const std::array& a, + const std::array& b) + { + std::array, N> out; + for (size_t i = 0; i < N; ++i) { + out[i] = { a[i], b[i] }; + } + return out; + } + + static constexpr auto combined2xLUTU16_{ + combineLUT(base2xLUTU16_, mask2xLUTU16_) + }; }; using VarByteCoder16 =