hellobertrand · hellobertrand · May 30, 2026
@@ -46,7 +46,8 @@ jobs:
 
       - name: Clone LZbench
         run: |
-          git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
+          # git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
+          git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}"
 
       - name: Copy Lib ZXC
         run: |

@@ -167,6 +167,12 @@ jobs:
             cross_prefix: x86_64-linux-gnu
             cmake_processor: x86_64
             qemu_binary: qemu-x86_64
+          - name: Linux amd64 (SSE2)
+            cross_pkg: gcc
+            cross_prefix: x86_64-linux-gnu
+            cmake_processor: x86_64
+            qemu_binary: qemu-x86_64
+            qemu_cpu: core2duo
           - name: Linux amd64 (AVX2)
             cross_pkg: gcc
             cross_prefix: x86_64-linux-gnu

@@ -178,11 +178,15 @@ else()
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
     message(STATUS "Building x86_64 AVX2 and AVX512 variants...")
     if(MSVC)
+        # SSE2 for MSVC: SSE2 is the x86-64 baseline
+        zxc_add_variant(_sse2 "/D__SSE2__")
         # AVX2 for MSVC (Enables AVX2/BMI1/BMI2 sets)
         zxc_add_variant(_avx2 "/arch:AVX2;/D__BMI__;/D__BMI2__;/D__LZCNT__")
         # AVX512 for MSVC (VS2019 16.10+ supports /arch:AVX512)
         zxc_add_variant(_avx512 "/arch:AVX512;/D__BMI__;/D__BMI2__;/D__LZCNT__")
     else()
+        # SSE2 for GCC/Clang (x86-64 baseline)
+        zxc_add_variant(_sse2 "-msse2")
         # AVX2 for GCC/Clang
         zxc_add_variant(_avx2 "-mavx2;-mfma;-mbmi;-mbmi2;-mlzcnt")
         # AVX512 for GCC/Clang

@@ -52,6 +52,7 @@ variant_sources = ['src/lib/zxc_compress.c',
 variant_sets = [['_default', []]]
 
 if host_cpu == 'x86_64'
+  variant_sets += [['_sse2',   ['-msse2']]]
   variant_sets += [['_avx2',   ['-mavx2', '-mfma', '-mbmi', '-mbmi2', '-mlzcnt']]]
   variant_sets += [['_avx512', ['-mavx512f', '-mavx512bw', '-mbmi', '-mbmi2', '-mlzcnt']]]
 elif host_cpu == 'aarch64'

@@ -76,6 +76,66 @@ static ZXC_ALWAYS_INLINE uint32_t zxc_mm256_reduce_max_epu32(__m256i v) {
 }
 #endif
 
+#if defined(ZXC_USE_SSE2)
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_max_epu32 (element-wise unsigned max).
+ *
+ * SSE2 has no unsigned 32-bit compare, so we map unsigned ordering to signed
+ * ordering by flipping the sign bit (XOR 0x80000000) before @c _mm_cmpgt_epi32,
+ * then select element-wise.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_max_epu32_sse2(__m128i a, __m128i b) {
+    const __m128i bias = _mm_set1_epi32((int)0x80000000);
+    const __m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
+    return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
+}
+
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_blendv_epi8.
+ *
+ * Selects bytes from @p b where the corresponding @p mask byte has its high bit
+ * set, else from @p a. In every call site the mask lanes are full-width compare
+ * results (all-ones or all-zero per element), so a plain bitwise select is exact.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
+    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
+}
+
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_packus_epi32 (saturating u32 -> u16).
+ *
+ * SSE2 only has signed @c _mm_packs_epi32 (saturates to int16). Bias each lane
+ * by -0x8000 so values in [0, 0xFFFF] land in the signed int16 range with no
+ * saturation, pack, then add 0x8000 back per 16-bit lane. Exact for inputs in
+ * [0, 0xFFFF] (all call sites pass match lengths < 2^16).
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_packus_epi32_sse2(__m128i a, __m128i b) {
+    const __m128i bias32 = _mm_set1_epi32(0x8000);
+    const __m128i bias16 = _mm_set1_epi16((short)0x8000);
+    const __m128i pa = _mm_sub_epi32(a, bias32);
+    const __m128i pb = _mm_sub_epi32(b, bias32);
+    return _mm_add_epi16(_mm_packs_epi32(pa, pb), bias16);
+}
+
+/**
+ * @brief Horizontal maximum of four packed unsigned 32-bit integers (SSE2).
+ *
+ * @param[in] v The 128-bit vector containing 4 unsigned 32-bit integers.
+ * @return The maximum unsigned 32-bit integer found in the vector.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE uint32_t zxc_mm_reduce_max_epu32(__m128i v) {
+    __m128i vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2));  // Swap 64-bit halves
+    v = zxc_mm_max_epu32_sse2(v, vshuf);                            // Max of pairs
+    vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));          // Swap adjacent lanes
+    v = zxc_mm_max_epu32_sse2(v, vshuf);                            // Final max
+    return (uint32_t)_mm_cvtsi128_si32(v);                          // Extract scalar result
+}
+#endif
+
 /**
  * @brief Writes a Prefix Varint encoded value to a buffer.
  *
@@ -315,6 +375,22 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match(
                     goto _match_len_done;
                 }
             }
+#elif defined(ZXC_USE_SSE2)
+            const uint8_t* limit_16 = iend - 16;
+            while (ip + mlen < limit_16) {
+                const __m128i v_src = _mm_loadu_si128((const __m128i*)(ip + mlen));
+                const __m128i v_ref = _mm_loadu_si128((const __m128i*)(ref + mlen));
+                const __m128i v_cmp = _mm_cmpeq_epi8(v_src, v_ref);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(v_cmp);
+                if (mask == 0xFFFFU)
+                    mlen += 16;
+                else {
+                    // mask != 0xFFFF => a differing byte exists in bits 0..15,
+                    // so the lowest set bit of ~mask lies in that range.
+                    mlen += zxc_ctz32(~mask);
+                    goto _match_len_done;
+                }
+            }
 #elif defined(ZXC_USE_NEON64)
             {
                 const uint8_t* limit_32 = iend - 32;
@@ -669,11 +745,37 @@ static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* R
             max_d = vgetq_lane_u32(v_max1, 0);
 #endif
 
+            if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
+        }
+#elif defined(ZXC_USE_SSE2)
+        // SSE2 processes 128-bit vectors (4 uint32 integers)
+        if (frames >= 4) {
+            __m128i v_max_accum = _mm_setzero_si128();  // Initialize max accumulator to 0
+
+            for (; j < (frames & ~3); j += 4) {
+                if (UNLIKELY(i == 0 && j == 0)) goto _scalar;
+
+                // Load 4 consecutive integers and the same window offset by -1
+                const __m128i vc = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4));
+                const __m128i vp = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4 - 4));
+
+                const __m128i diff = _mm_sub_epi32(vc, vp);  // Compute deltas: curr - prev
+
+                // ZigZag encode: (diff << 1) ^ (diff >> 31)
+                const __m128i zigzag =
+                    _mm_xor_si128(_mm_slli_epi32(diff, 1), _mm_srai_epi32(diff, 31));
+                _mm_storeu_si128((__m128i*)&deltas[j], zigzag);  // Store results
+                // SSE2 has no unsigned max; use the SSE2 emulation.
+                v_max_accum = zxc_mm_max_epu32_sse2(v_max_accum, zigzag);  // Update max accumulator
+            }
+
+            max_d = zxc_mm_reduce_max_epu32(v_max_accum);  // Horizontal max reduction
+
             if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
         }
 #endif
 #if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
     _scalar:
 #endif
         for (; j < frames; j++) {
@@ -823,6 +925,34 @@ static ZXC_ALWAYS_INLINE size_t zxc_opt_dp_update_const_cost(
             vst1_u16(&parent_off[p + L], vbsl_u16(v_mask16, v_off, v_po));
         }
     }
+#elif defined(ZXC_USE_SSE2)
+    if (L + 4 <= L_end) {
+        const __m128i v_inc = _mm_setr_epi32(0, 1, 2, 3);
+        const __m128i v_nxt = _mm_set1_epi32((int)nxt);
+        const __m128i v_bias = _mm_set1_epi32((int)0x80000000);
+        const __m128i v_nxt_b = _mm_xor_si128(v_nxt, v_bias);
+        const __m128i v_off = _mm_set1_epi16((short)off_biased);
+        for (; L + 4 <= L_end; L += 4) {
+            const __m128i v_L_lanes = _mm_add_epi32(v_inc, _mm_set1_epi32((int)L));
+            const __m128i v_dp = _mm_loadu_si128((const __m128i*)&dp[p + L]);
+            /* Unsigned compare via sign-bit bias (SSE2 cmpgt is signed only):
+             *   (dp ^ 0x80000000) > (nxt ^ 0x80000000)  iff  dp > nxt. */
+            const __m128i v_dp_b = _mm_xor_si128(v_dp, v_bias);
+            const __m128i v_mask = _mm_cmpgt_epi32(v_dp_b, v_nxt_b);
+            const __m128i v_dp_new = zxc_mm_blendv_epi8_sse2(v_dp, v_nxt, v_mask);
+            _mm_storeu_si128((__m128i*)&dp[p + L], v_dp_new);
+            /* Narrow the 4x int32 mask / length lanes to 4x int16 (low 64 bits).
+             * packs: 0xFFFFFFFF -> 0xFFFF, 0 -> 0; packus (SSE2-emulated): u32->u16. */
+            const __m128i v_mask16 = _mm_packs_epi32(v_mask, v_mask);
+            const __m128i v_L_u16 = zxc_mm_packus_epi32_sse2(v_L_lanes, v_L_lanes);
+            __m128i v_pl = _mm_loadl_epi64((const __m128i*)&parent_len[p + L]);
+            v_pl = zxc_mm_blendv_epi8_sse2(v_pl, v_L_u16, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_len[p + L], v_pl);
+            __m128i v_po = _mm_loadl_epi64((const __m128i*)&parent_off[p + L]);
+            v_po = zxc_mm_blendv_epi8_sse2(v_po, v_off, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_off[p + L], v_po);
+        }
+    }
 #endif
     /* Scalar tail (and full path on archs without SIMD).
      * L < L_end <= UINT16_MAX (caller precondition), so the cast is lossless. */
@@ -1398,6 +1528,17 @@ parse_done:;
                 }
                 p += 32;
             }
+#elif defined(ZXC_USE_SSE2)
+            const __m128i vb = _mm_set1_epi8((char)b);
+            while (p <= p_end - 16) {
+                const __m128i v = _mm_loadu_si128((const __m128i*)p);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vb));
+                if (mask != 0xFFFFU) {
+                    p += zxc_ctz32(~mask);
+                    goto _run_done;
+                }
+                p += 16;
+            }
 #elif defined(ZXC_USE_NEON64)
             const uint8x16_t vb = vdupq_n_u8(b);
             while (p <= p_end - 16) {
@@ -1446,7 +1587,7 @@ parse_done:;
             while (p < p_end && *p == b) p++;
 
 #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
         _run_done:;
 #endif
             const size_t run = (size_t)(p - run_start);
@@ -1497,6 +1638,22 @@ parse_done:;
                     }
                     p += 32;
                 }
+#elif defined(ZXC_USE_SSE2)
+                while (p <= p_end_4 - 16) {
+                    __m128i v0 = _mm_loadu_si128((const __m128i*)p);
+                    __m128i v1 = _mm_loadu_si128((const __m128i*)(p + 1));
+                    __m128i v2 = _mm_loadu_si128((const __m128i*)(p + 2));
+                    __m128i v3 = _mm_loadu_si128((const __m128i*)(p + 3));
+                    __m128i vend = _mm_and_si128(
+                        _mm_cmpeq_epi8(v0, v1),
+                        _mm_and_si128(_mm_cmpeq_epi8(v1, v2), _mm_cmpeq_epi8(v2, v3)));
+                    uint32_t mask = (uint32_t)_mm_movemask_epi8(vend);
+                    if (mask != 0) {
+                        p += zxc_ctz32(mask);
+                        goto _lit_done;
+                    }
+                    p += 16;
+                }
 #elif defined(ZXC_USE_NEON64)
                 while (p <= p_end_4 - 16) {
                     uint8x16_t v0 = vld1q_u8(p);
@@ -1561,7 +1718,7 @@ parse_done:;
                 }
 
 #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
             _lit_done:;
 #endif
                 const size_t lit_run = (size_t)(p - lit_start);

@@ -195,11 +195,13 @@ static ZXC_ALWAYS_INLINE void zxc_copy_overlap16(uint8_t* dst, uint32_t off) {
     vst1_u8(dst + 8, vtbl2_u8(src_tbl, mask_hi));
 
 #elif defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
+    // SSSE3 PSHUFB (implied by AVX2/AVX512): single 16-byte table-lookup replicate.
     __m128i mask = _mm_load_si128((const __m128i*)zxc_overlap_masks[off]);
     __m128i src_data = _mm_loadu_si128((const __m128i*)(dst - off));
     _mm_storeu_si128((__m128i*)dst, _mm_shuffle_epi8(src_data, mask));
 
 #else
+    // SSE2-only tier and non-SIMD builds: scalar replicate (no PSHUFB).
     const uint8_t* src = dst - off;
     for (size_t i = 0; i < 16; i++) {
         dst[i] = src[i % off];
@@ -308,6 +310,24 @@ static ZXC_ALWAYS_INLINE __m512i zxc_mm512_prefix_sum_epi32(__m512i v) {
 }
 #endif
 
+#if defined(ZXC_USE_SSE2)
+/**
+ * @brief Computes the prefix sum of a 128-bit vector of four 32-bit integers
+ *        using SSE2 byte-shift adds (the 128-bit analogue of the NEON helper).
+ *
+ * For input [a, b, c, d] the result is [a, a+b, a+b+c, a+b+c+d].
+ *
+ * @param[in] v Input vector of four 32-bit integers.
+ * @return Vector containing the prefix sums.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_prefix_sum_epi32(__m128i v) {
+    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));  // [a, a+b, b+c, c+d]
+    v = _mm_add_epi32(v, _mm_slli_si128(v, 8));  // [a, a+b, a+b+c, a+b+c+d]
+    return v;
+}
+#endif
+
 /**
  * @brief Decodes a block of numerical data compressed with the ZXC format.
  *
@@ -442,6 +462,22 @@ static int zxc_decode_block_num(const uint8_t* RESTRICT src, const size_t src_si
             running_val = vgetq_lane_u32(v_run, 0);  // Extract once at the end of the batch
 #endif
 
+#elif defined(ZXC_USE_SSE2)
+            __m128i v_run = _mm_set1_epi32((int)running_val);  // Broadcast running total
+            for (int k = 0; k < ZXC_NUM_DEC_BATCH; k += 4) {
+                __m128i v_deltas = _mm_load_si128((const __m128i*)&deltas[k]);  // Load 4 deltas
+
+                __m128i v_sum = zxc_mm_prefix_sum_epi32(v_deltas);  // Local prefix sums
+                v_sum = _mm_add_epi32(v_sum, v_run);                // Add base running total
+
+                _mm_storeu_si128((__m128i*)&batch_dst[k], v_sum);  // Store decoded values
+
+                // Broadcast 4th element (lane 3) to all lanes for the next iteration.
+                v_run = _mm_shuffle_epi32(v_sum, _MM_SHUFFLE(3, 3, 3, 3));
+            }
+            // Extract final running_val back to GPR for the scalar tail.
+            running_val = (uint32_t)_mm_cvtsi128_si32(v_run);
+
 #else
             for (int k = 0; k < ZXC_NUM_DEC_BATCH; k++) {
                 running_val += deltas[k];