diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 937cbe32..50b8cee9 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -46,7 +46,8 @@ jobs:
 
       - name: Clone LZbench
         run: |
-          git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
+          # git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
+          git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}"
 
       - name: Copy Lib ZXC
         run: |
diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml
index 0343945d..94815758 100644
--- a/.github/workflows/multiarch.yml
+++ b/.github/workflows/multiarch.yml
@@ -167,6 +167,12 @@ jobs:
             cross_prefix: x86_64-linux-gnu
             cmake_processor: x86_64
             qemu_binary: qemu-x86_64
+          - name: Linux amd64 (SSE2)
+            cross_pkg: gcc
+            cross_prefix: x86_64-linux-gnu
+            cmake_processor: x86_64
+            qemu_binary: qemu-x86_64
+            qemu_cpu: core2duo
           - name: Linux amd64 (AVX2)
             cross_pkg: gcc
             cross_prefix: x86_64-linux-gnu
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76185070..98e8ff91 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,11 +178,15 @@ else()
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
     message(STATUS "Building x86_64 AVX2 and AVX512 variants...")
     if(MSVC)
+        # SSE2 for MSVC: SSE2 is the x86-64 baseline
+        zxc_add_variant(_sse2 "/D__SSE2__")
         # AVX2 for MSVC (Enables AVX2/BMI1/BMI2 sets)
         zxc_add_variant(_avx2 "/arch:AVX2;/D__BMI__;/D__BMI2__;/D__LZCNT__")
         # AVX512 for MSVC (VS2019 16.10+ supports /arch:AVX512)
         zxc_add_variant(_avx512 "/arch:AVX512;/D__BMI__;/D__BMI2__;/D__LZCNT__")
     else()
+        # SSE2 for GCC/Clang (x86-64 baseline)
+        zxc_add_variant(_sse2 "-msse2")
         # AVX2 for GCC/Clang
         zxc_add_variant(_avx2 "-mavx2;-mfma;-mbmi;-mbmi2;-mlzcnt")
         # AVX512 for GCC/Clang
diff --git a/meson.build b/meson.build
index dac03d92..a8af39f3 100644
--- a/meson.build
+++ b/meson.build
@@ -52,6 +52,7 @@ variant_sources = ['src/lib/zxc_compress.c',
 variant_sets = [['_default', []]]
 
 if host_cpu == 'x86_64'
+  variant_sets += [['_sse2',   ['-msse2']]]
   variant_sets += [['_avx2',   ['-mavx2', '-mfma', '-mbmi', '-mbmi2', '-mlzcnt']]]
   variant_sets += [['_avx512', ['-mavx512f', '-mavx512bw', '-mbmi', '-mbmi2', '-mlzcnt']]]
 elif host_cpu == 'aarch64'
diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c
index 29f6a035..694d07d6 100644
--- a/src/lib/zxc_compress.c
+++ b/src/lib/zxc_compress.c
@@ -76,6 +76,66 @@ static ZXC_ALWAYS_INLINE uint32_t zxc_mm256_reduce_max_epu32(__m256i v) {
 }
 #endif
 
+#if defined(ZXC_USE_SSE2)
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_max_epu32 (element-wise unsigned max).
+ *
+ * SSE2 has no unsigned 32-bit compare, so we map unsigned ordering to signed
+ * ordering by flipping the sign bit (XOR 0x80000000) before @c _mm_cmpgt_epi32,
+ * then select element-wise.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_max_epu32_sse2(__m128i a, __m128i b) {
+    const __m128i bias = _mm_set1_epi32((int)0x80000000);
+    const __m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
+    return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
+}
+
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_blendv_epi8.
+ *
+ * Selects bytes from @p b where the corresponding @p mask byte has its high bit
+ * set, else from @p a. In every call site the mask lanes are full-width compare
+ * results (all-ones or all-zero per element), so a plain bitwise select is exact.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
+    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
+}
+
+/**
+ * @brief SSE2 emulation of SSE4.1 @c _mm_packus_epi32 (saturating u32 -> u16).
+ *
+ * SSE2 only has signed @c _mm_packs_epi32 (saturates to int16). Bias each lane
+ * by -0x8000 so values in [0, 0xFFFF] land in the signed int16 range with no
+ * saturation, pack, then add 0x8000 back per 16-bit lane. Exact for inputs in
+ * [0, 0xFFFF] (all call sites pass match lengths < 2^16).
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_packus_epi32_sse2(__m128i a, __m128i b) {
+    const __m128i bias32 = _mm_set1_epi32(0x8000);
+    const __m128i bias16 = _mm_set1_epi16((short)0x8000);
+    const __m128i pa = _mm_sub_epi32(a, bias32);
+    const __m128i pb = _mm_sub_epi32(b, bias32);
+    return _mm_add_epi16(_mm_packs_epi32(pa, pb), bias16);
+}
+
+/**
+ * @brief Horizontal maximum of four packed unsigned 32-bit integers (SSE2).
+ *
+ * @param[in] v The 128-bit vector containing 4 unsigned 32-bit integers.
+ * @return The maximum unsigned 32-bit integer found in the vector.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE uint32_t zxc_mm_reduce_max_epu32(__m128i v) {
+    __m128i vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2));  // Swap 64-bit halves
+    v = zxc_mm_max_epu32_sse2(v, vshuf);                            // Max of pairs
+    vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));          // Swap adjacent lanes
+    v = zxc_mm_max_epu32_sse2(v, vshuf);                            // Final max
+    return (uint32_t)_mm_cvtsi128_si32(v);                          // Extract scalar result
+}
+#endif
+
 /**
  * @brief Writes a Prefix Varint encoded value to a buffer.
  *
@@ -315,6 +375,22 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match(
                     goto _match_len_done;
                 }
             }
+#elif defined(ZXC_USE_SSE2)
+            const uint8_t* limit_16 = iend - 16;
+            while (ip + mlen < limit_16) {
+                const __m128i v_src = _mm_loadu_si128((const __m128i*)(ip + mlen));
+                const __m128i v_ref = _mm_loadu_si128((const __m128i*)(ref + mlen));
+                const __m128i v_cmp = _mm_cmpeq_epi8(v_src, v_ref);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(v_cmp);
+                if (mask == 0xFFFFU)
+                    mlen += 16;
+                else {
+                    // mask != 0xFFFF => a differing byte exists in bits 0..15,
+                    // so the lowest set bit of ~mask lies in that range.
+                    mlen += zxc_ctz32(~mask);
+                    goto _match_len_done;
+                }
+            }
 #elif defined(ZXC_USE_NEON64)
             {
                 const uint8_t* limit_32 = iend - 32;
@@ -669,11 +745,37 @@ static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* R
             max_d = vgetq_lane_u32(v_max1, 0);
 #endif
 
+            if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
+        }
+#elif defined(ZXC_USE_SSE2)
+        // SSE2 processes 128-bit vectors (4 uint32 integers)
+        if (frames >= 4) {
+            __m128i v_max_accum = _mm_setzero_si128();  // Initialize max accumulator to 0
+
+            for (; j < (frames & ~3); j += 4) {
+                if (UNLIKELY(i == 0 && j == 0)) goto _scalar;
+
+                // Load 4 consecutive integers and the same window offset by -1
+                const __m128i vc = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4));
+                const __m128i vp = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4 - 4));
+
+                const __m128i diff = _mm_sub_epi32(vc, vp);  // Compute deltas: curr - prev
+
+                // ZigZag encode: (diff << 1) ^ (diff >> 31)
+                const __m128i zigzag =
+                    _mm_xor_si128(_mm_slli_epi32(diff, 1), _mm_srai_epi32(diff, 31));
+                _mm_storeu_si128((__m128i*)&deltas[j], zigzag);  // Store results
+                // SSE2 has no unsigned max; use the SSE2 emulation.
+                v_max_accum = zxc_mm_max_epu32_sse2(v_max_accum, zigzag);  // Update max accumulator
+            }
+
+            max_d = zxc_mm_reduce_max_epu32(v_max_accum);  // Horizontal max reduction
+
             if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
         }
 #endif
 #if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
     _scalar:
 #endif
         for (; j < frames; j++) {
@@ -823,6 +925,34 @@ static ZXC_ALWAYS_INLINE size_t zxc_opt_dp_update_const_cost(
             vst1_u16(&parent_off[p + L], vbsl_u16(v_mask16, v_off, v_po));
         }
     }
+#elif defined(ZXC_USE_SSE2)
+    if (L + 4 <= L_end) {
+        const __m128i v_inc = _mm_setr_epi32(0, 1, 2, 3);
+        const __m128i v_nxt = _mm_set1_epi32((int)nxt);
+        const __m128i v_bias = _mm_set1_epi32((int)0x80000000);
+        const __m128i v_nxt_b = _mm_xor_si128(v_nxt, v_bias);
+        const __m128i v_off = _mm_set1_epi16((short)off_biased);
+        for (; L + 4 <= L_end; L += 4) {
+            const __m128i v_L_lanes = _mm_add_epi32(v_inc, _mm_set1_epi32((int)L));
+            const __m128i v_dp = _mm_loadu_si128((const __m128i*)&dp[p + L]);
+            /* Unsigned compare via sign-bit bias (SSE2 cmpgt is signed only):
+             *   (dp ^ 0x80000000) > (nxt ^ 0x80000000)  iff  dp > nxt. */
+            const __m128i v_dp_b = _mm_xor_si128(v_dp, v_bias);
+            const __m128i v_mask = _mm_cmpgt_epi32(v_dp_b, v_nxt_b);
+            const __m128i v_dp_new = zxc_mm_blendv_epi8_sse2(v_dp, v_nxt, v_mask);
+            _mm_storeu_si128((__m128i*)&dp[p + L], v_dp_new);
+            /* Narrow the 4x int32 mask / length lanes to 4x int16 (low 64 bits).
+             * packs: 0xFFFFFFFF -> 0xFFFF, 0 -> 0; packus (SSE2-emulated): u32->u16. */
+            const __m128i v_mask16 = _mm_packs_epi32(v_mask, v_mask);
+            const __m128i v_L_u16 = zxc_mm_packus_epi32_sse2(v_L_lanes, v_L_lanes);
+            __m128i v_pl = _mm_loadl_epi64((const __m128i*)&parent_len[p + L]);
+            v_pl = zxc_mm_blendv_epi8_sse2(v_pl, v_L_u16, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_len[p + L], v_pl);
+            __m128i v_po = _mm_loadl_epi64((const __m128i*)&parent_off[p + L]);
+            v_po = zxc_mm_blendv_epi8_sse2(v_po, v_off, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_off[p + L], v_po);
+        }
+    }
 #endif
     /* Scalar tail (and full path on archs without SIMD).
      * L < L_end <= UINT16_MAX (caller precondition), so the cast is lossless. */
@@ -1398,6 +1528,17 @@ parse_done:;
                 }
                 p += 32;
             }
+#elif defined(ZXC_USE_SSE2)
+            const __m128i vb = _mm_set1_epi8((char)b);
+            while (p <= p_end - 16) {
+                const __m128i v = _mm_loadu_si128((const __m128i*)p);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vb));
+                if (mask != 0xFFFFU) {
+                    p += zxc_ctz32(~mask);
+                    goto _run_done;
+                }
+                p += 16;
+            }
 #elif defined(ZXC_USE_NEON64)
             const uint8x16_t vb = vdupq_n_u8(b);
             while (p <= p_end - 16) {
@@ -1446,7 +1587,7 @@ parse_done:;
             while (p < p_end && *p == b) p++;
 
 #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
         _run_done:;
 #endif
             const size_t run = (size_t)(p - run_start);
@@ -1497,6 +1638,22 @@ parse_done:;
                     }
                     p += 32;
                 }
+#elif defined(ZXC_USE_SSE2)
+                while (p <= p_end_4 - 16) {
+                    __m128i v0 = _mm_loadu_si128((const __m128i*)p);
+                    __m128i v1 = _mm_loadu_si128((const __m128i*)(p + 1));
+                    __m128i v2 = _mm_loadu_si128((const __m128i*)(p + 2));
+                    __m128i v3 = _mm_loadu_si128((const __m128i*)(p + 3));
+                    __m128i vend = _mm_and_si128(
+                        _mm_cmpeq_epi8(v0, v1),
+                        _mm_and_si128(_mm_cmpeq_epi8(v1, v2), _mm_cmpeq_epi8(v2, v3)));
+                    uint32_t mask = (uint32_t)_mm_movemask_epi8(vend);
+                    if (mask != 0) {
+                        p += zxc_ctz32(mask);
+                        goto _lit_done;
+                    }
+                    p += 16;
+                }
 #elif defined(ZXC_USE_NEON64)
                 while (p <= p_end_4 - 16) {
                     uint8x16_t v0 = vld1q_u8(p);
@@ -1561,7 +1718,7 @@ parse_done:;
                 }
 
 #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
-    defined(ZXC_USE_NEON32)
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
             _lit_done:;
 #endif
                 const size_t lit_run = (size_t)(p - lit_start);
diff --git a/src/lib/zxc_decompress.c b/src/lib/zxc_decompress.c
index 8f47a24e..ed5af86d 100644
--- a/src/lib/zxc_decompress.c
+++ b/src/lib/zxc_decompress.c
@@ -195,11 +195,13 @@ static ZXC_ALWAYS_INLINE void zxc_copy_overlap16(uint8_t* dst, uint32_t off) {
     vst1_u8(dst + 8, vtbl2_u8(src_tbl, mask_hi));
 
 #elif defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
+    // SSSE3 PSHUFB (implied by AVX2/AVX512): single 16-byte table-lookup replicate.
     __m128i mask = _mm_load_si128((const __m128i*)zxc_overlap_masks[off]);
     __m128i src_data = _mm_loadu_si128((const __m128i*)(dst - off));
     _mm_storeu_si128((__m128i*)dst, _mm_shuffle_epi8(src_data, mask));
 
 #else
+    // SSE2-only tier and non-SIMD builds: scalar replicate (no PSHUFB).
     const uint8_t* src = dst - off;
     for (size_t i = 0; i < 16; i++) {
         dst[i] = src[i % off];
@@ -308,6 +310,24 @@ static ZXC_ALWAYS_INLINE __m512i zxc_mm512_prefix_sum_epi32(__m512i v) {
 }
 #endif
 
+#if defined(ZXC_USE_SSE2)
+/**
+ * @brief Computes the prefix sum of a 128-bit vector of four 32-bit integers
+ *        using SSE2 byte-shift adds (the 128-bit analogue of the NEON helper).
+ *
+ * For input [a, b, c, d] the result is [a, a+b, a+b+c, a+b+c+d].
+ *
+ * @param[in] v Input vector of four 32-bit integers.
+ * @return Vector containing the prefix sums.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_prefix_sum_epi32(__m128i v) {
+    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));  // [a, a+b, b+c, c+d]
+    v = _mm_add_epi32(v, _mm_slli_si128(v, 8));  // [a, a+b, a+b+c, a+b+c+d]
+    return v;
+}
+#endif
+
 /**
  * @brief Decodes a block of numerical data compressed with the ZXC format.
  *
@@ -442,6 +462,22 @@ static int zxc_decode_block_num(const uint8_t* RESTRICT src, const size_t src_si
             running_val = vgetq_lane_u32(v_run, 0);  // Extract once at the end of the batch
 #endif
 
+#elif defined(ZXC_USE_SSE2)
+            __m128i v_run = _mm_set1_epi32((int)running_val);  // Broadcast running total
+            for (int k = 0; k < ZXC_NUM_DEC_BATCH; k += 4) {
+                __m128i v_deltas = _mm_load_si128((const __m128i*)&deltas[k]);  // Load 4 deltas
+
+                __m128i v_sum = zxc_mm_prefix_sum_epi32(v_deltas);  // Local prefix sums
+                v_sum = _mm_add_epi32(v_sum, v_run);                // Add base running total
+
+                _mm_storeu_si128((__m128i*)&batch_dst[k], v_sum);  // Store decoded values
+
+                // Broadcast 4th element (lane 3) to all lanes for the next iteration.
+                v_run = _mm_shuffle_epi32(v_sum, _MM_SHUFFLE(3, 3, 3, 3));
+            }
+            // Extract final running_val back to GPR for the scalar tail.
+            running_val = (uint32_t)_mm_cvtsi128_si32(v_run);
+
 #else
             for (int k = 0; k < ZXC_NUM_DEC_BATCH; k++) {
                 running_val += deltas[k];
diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c
index 6526bfd9..d37c31e7 100644
--- a/src/lib/zxc_dispatch.c
+++ b/src/lib/zxc_dispatch.c
@@ -65,6 +65,12 @@ int zxc_decompress_chunk_wrapper_safe_avx2(zxc_cctx_t* RESTRICT ctx, const uint8
 int zxc_decompress_chunk_wrapper_safe_avx512(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
                                              const size_t src_sz, uint8_t* RESTRICT dst,
                                              const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                           const size_t src_sz, uint8_t* RESTRICT dst,
+                                           const size_t dst_cap);
 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 int zxc_decompress_chunk_wrapper_neon(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
                                       const size_t src_sz, uint8_t* RESTRICT dst,
@@ -99,6 +105,9 @@ int zxc_compress_chunk_wrapper_avx2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RES
 int zxc_compress_chunk_wrapper_avx512(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
                                       const size_t src_sz, uint8_t* RESTRICT dst,
                                       const size_t dst_cap);
+int zxc_compress_chunk_wrapper_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                    const size_t src_sz, uint8_t* RESTRICT dst,
+                                    const size_t dst_cap);
 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 int zxc_compress_chunk_wrapper_neon(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
                                     const size_t src_sz, uint8_t* RESTRICT dst,
@@ -119,7 +128,8 @@ typedef enum {
     ZXC_CPU_GENERIC = 0, /**< @brief Scalar-only fallback.   */
     ZXC_CPU_AVX2 = 1,    /**< @brief x86-64 AVX2 available.  */
     ZXC_CPU_AVX512 = 2,  /**< @brief x86-64 AVX-512F+BW available. */
-    ZXC_CPU_NEON = 3     /**< @brief ARM NEON available.      */
+    ZXC_CPU_NEON = 3,    /**< @brief ARM NEON available.      */
+    ZXC_CPU_SSE2 = 4     /**< @brief x86 SSE2 available (no AVX2); x86-64 baseline. */
 } zxc_cpu_feature_t;
 
 /**
@@ -140,15 +150,17 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(_MSC_VER)
     // MSVC detection using __cpuid
-    // Function ID 1: EAX=1. ECX: Bit 28=AVX.
+    // Function ID 1: EAX=1. EDX: Bit 26=SSE2. ECX: Bit 28=AVX.
     // Function ID 7: EAX=7, ECX=0. EBX: Bit 5=AVX2, Bit 16=AVX512F, Bit 30=AVX512BW.
     int regs[4];
+    int sse2 = 0;
     int avx = 0;
     int avx2 = 0;
     int avx512 = 0;
 
     __cpuid(regs, 1);
-    if (regs[2] & (1 << 28)) avx = 1;
+    if (regs[3] & (1 << 26)) sse2 = 1;  // EDX bit 26 = SSE2
+    if (regs[2] & (1 << 28)) avx = 1;   // ECX bit 28 = AVX
 
     if (avx) {
         __cpuidex(regs, 7, 0);
@@ -160,6 +172,8 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) {
         features = ZXC_CPU_AVX512;
     } else if (avx2) {
         features = ZXC_CPU_AVX2;
+    } else if (sse2) {
+        features = ZXC_CPU_SSE2;
     }
 #else
     // GCC/Clang built-in detection
@@ -169,6 +183,8 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) {
         features = ZXC_CPU_AVX512;
     } else if (__builtin_cpu_supports("avx2")) {
         features = ZXC_CPU_AVX2;
+    } else if (__builtin_cpu_supports("sse2")) {
+        features = ZXC_CPU_SSE2;
     }
 #endif
 
@@ -238,6 +254,8 @@ static int zxc_decompress_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uint8_t*
         zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_avx512;
     else if (cpu == ZXC_CPU_AVX2)
         zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_avx2;
+    else if (cpu == ZXC_CPU_SSE2)
+        zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_sse2;
     else
         zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_default;
 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
@@ -283,6 +301,8 @@ static int zxc_decompress_safe_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uin
         zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_avx512;
     else if (cpu == ZXC_CPU_AVX2)
         zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_avx2;
+    else if (cpu == ZXC_CPU_SSE2)
+        zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_sse2;
     else
         zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_default;
 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
@@ -329,6 +349,8 @@ static int zxc_compress_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uint8_t* R
         zxc_compress_ptr_local = zxc_compress_chunk_wrapper_avx512;
     else if (cpu == ZXC_CPU_AVX2)
         zxc_compress_ptr_local = zxc_compress_chunk_wrapper_avx2;
+    else if (cpu == ZXC_CPU_SSE2)
+        zxc_compress_ptr_local = zxc_compress_chunk_wrapper_sse2;
     else
         zxc_compress_ptr_local = zxc_compress_chunk_wrapper_default;
 #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h
index c753c2c0..9b5c1255 100644
--- a/src/lib/zxc_internal.h
+++ b/src/lib/zxc_internal.h
@@ -72,9 +72,20 @@ extern "C" {
  * may be defined:
  * - @c ZXC_USE_AVX512 - AVX-512F + AVX-512BW available.
  * - @c ZXC_USE_AVX2   - AVX2 available.
+ * - @c ZXC_USE_SSE2   - SSE2 (x86-64 baseline) available.
  * - @c ZXC_USE_NEON64 - AArch64 NEON available.
  * - @c ZXC_USE_NEON32 - ARMv7 NEON available.
  *
+ * Note: @c -mavx2 / @c -mavx512f imply @c __SSE2__, so @c ZXC_USE_SSE2 is
+ * also defined in the AVX variants. The hand-written SIMD code paths therefore
+ * order their preprocessor branches AVX512 -> AVX2 -> SSE2 so the widest
+ * available path wins; the SSE2 branch is the active one only in the dedicated
+ * @c _sse2 variant (no AVX2/AVX512 flags). SSE2 is the x86-64 baseline, so this
+ * tier covers every 64-bit x86 CPU (and i686 with @c -msse2). The handful of
+ * operations that would otherwise require SSE4.1 (@c _mm_max_epu32,
+ * @c _mm_blendv_epi8, @c _mm_packus_epi32) or SSSE3 (@c _mm_shuffle_epi8) are
+ * emulated with pure SSE2 instruction sequences or fall back to scalar code.
+ *
  * Define @c ZXC_DISABLE_SIMD to gate all hand-written SIMD paths (intrinsics,
  * inline assembly).  Compiler auto-vectorisation is unaffected.
  * @{
@@ -93,6 +104,11 @@ extern "C" {
 #define ZXC_USE_AVX2
 #endif
 #endif
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#ifndef ZXC_USE_SSE2
+#define ZXC_USE_SSE2
+#endif
+#endif
 #elif (defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) || \
        defined(ZXC_USE_NEON32) || defined(ZXC_USE_NEON64))
 #if !defined(_MSC_VER)
@@ -1008,8 +1024,8 @@ static ZXC_ALWAYS_INLINE uint16_t zxc_hash16(const uint8_t* p) {
  * @param[in] src Pointer to the source memory block.
  */
 static ZXC_ALWAYS_INLINE void zxc_copy16(void* dst, const void* src) {
-#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
-    // AVX2/AVX512: Single 128-bit unaligned load/store
+#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_SSE2)
+    // x86 SSE2/AVX2/AVX512: Single 128-bit unaligned load/store
     _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
 #elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
     vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
@@ -1030,6 +1046,11 @@ static ZXC_ALWAYS_INLINE void zxc_copy32(void* dst, const void* src) {
 #if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
     // AVX2/AVX512: Single 256-bit (32 byte) unaligned load/store
     _mm256_storeu_si256((__m256i*)dst, _mm256_loadu_si256((const __m256i*)src));
+#elif defined(ZXC_USE_SSE2)
+    // SSE2: Two 128-bit (16 byte) unaligned load/stores (no 256-bit regs)
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
+    _mm_storeu_si128((__m128i*)((uint8_t*)dst + 16),
+                     _mm_loadu_si128((const __m128i*)((const uint8_t*)src + 16)));
 #elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
     // NEON: Two 128-bit (16 byte) unaligned load/stores
     vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
diff --git a/wrappers/rust/zxc-sys/build.rs b/wrappers/rust/zxc-sys/build.rs
index 24d8a4a9..b3530f7f 100644
--- a/wrappers/rust/zxc-sys/build.rs
+++ b/wrappers/rust/zxc-sys/build.rs
@@ -11,7 +11,7 @@
 //! to support runtime CPU feature detection and optimized code paths.
 //!
 //! On ARM64: Compiles `_default` and `_neon` variants
-//! On x86_64: Compiles `_default`, `_avx2`, and `_avx512` variants
+//! On x86_64: Compiles `_default`, `_sse2`, `_avx2`, and `_avx512` variants
 
 use std::env;
 use std::fs;
@@ -217,7 +217,7 @@ fn main() {
     if is_arm64 {
         core_build.flag_if_supported("-march=armv8-a+crc");
     } else if is_x86_64 {
-        core_build.flag_if_supported("-msse4.2");
+        core_build.flag_if_supported("-msse2");
         core_build.flag_if_supported("-mpclmul");
     }
 
@@ -271,6 +271,48 @@ fn main() {
         neon_decompress.compile("zxc_decompress_neon");
         neon_huffman.compile("zxc_huffman_neon");
     } else if is_x86_64 {
+        // SSE2 variant: the x86-64 baseline (also covers any i686 built with
+        // SSE2). Mirrors the _sse2 variant in CMakeLists.txt / meson.build.
+        let mut sse2_compress = cc::Build::new();
+        sse2_compress
+            .include(&include_dir)
+            .include(&src_lib)
+            .include(src_lib.join("vendors"))
+            .define("ZXC_STATIC_DEFINE", None)
+            .file(src_lib.join("zxc_compress.c"))
+            .define("ZXC_FUNCTION_SUFFIX", "_sse2")
+            .flag_if_supported("-msse2")
+            .opt_level(3)
+            .warnings(false);
+
+        let mut sse2_decompress = cc::Build::new();
+        sse2_decompress
+            .include(&include_dir)
+            .include(&src_lib)
+            .include(src_lib.join("vendors"))
+            .define("ZXC_STATIC_DEFINE", None)
+            .file(src_lib.join("zxc_decompress.c"))
+            .define("ZXC_FUNCTION_SUFFIX", "_sse2")
+            .flag_if_supported("-msse2")
+            .opt_level(3)
+            .warnings(false);
+
+        let mut sse2_huffman = cc::Build::new();
+        sse2_huffman
+            .include(&include_dir)
+            .include(&src_lib)
+            .include(src_lib.join("vendors"))
+            .define("ZXC_STATIC_DEFINE", None)
+            .file(src_lib.join("zxc_huffman.c"))
+            .define("ZXC_FUNCTION_SUFFIX", "_sse2")
+            .flag_if_supported("-msse2")
+            .opt_level(3)
+            .warnings(false);
+
+        sse2_compress.compile("zxc_compress_sse2");
+        sse2_decompress.compile("zxc_decompress_sse2");
+        sse2_huffman.compile("zxc_huffman_sse2");
+
         // AVX2 variant
         let mut avx2_compress = cc::Build::new();
         avx2_compress