diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 937cbe32..50b8cee9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -46,7 +46,8 @@ jobs: - name: Clone LZbench run: | - git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}" + # git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}" + git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}" - name: Copy Lib ZXC run: | diff --git a/.github/workflows/multiarch.yml b/.github/workflows/multiarch.yml index 0343945d..94815758 100644 --- a/.github/workflows/multiarch.yml +++ b/.github/workflows/multiarch.yml @@ -167,6 +167,12 @@ jobs: cross_prefix: x86_64-linux-gnu cmake_processor: x86_64 qemu_binary: qemu-x86_64 + - name: Linux amd64 (SSE2) + cross_pkg: gcc + cross_prefix: x86_64-linux-gnu + cmake_processor: x86_64 + qemu_binary: qemu-x86_64 + qemu_cpu: core2duo - name: Linux amd64 (AVX2) cross_pkg: gcc cross_prefix: x86_64-linux-gnu diff --git a/CMakeLists.txt b/CMakeLists.txt index 76185070..98e8ff91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,11 +178,15 @@ else() if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64") message(STATUS "Building x86_64 AVX2 and AVX512 variants...") if(MSVC) + # SSE2 for MSVC: SSE2 is the x86-64 baseline + zxc_add_variant(_sse2 "/D__SSE2__") # AVX2 for MSVC (Enables AVX2/BMI1/BMI2 sets) zxc_add_variant(_avx2 "/arch:AVX2;/D__BMI__;/D__BMI2__;/D__LZCNT__") # AVX512 for MSVC (VS2019 16.10+ supports /arch:AVX512) zxc_add_variant(_avx512 "/arch:AVX512;/D__BMI__;/D__BMI2__;/D__LZCNT__") else() + # SSE2 for GCC/Clang (x86-64 baseline) + zxc_add_variant(_sse2 "-msse2") # AVX2 for GCC/Clang zxc_add_variant(_avx2 "-mavx2;-mfma;-mbmi;-mbmi2;-mlzcnt") # AVX512 for GCC/Clang diff --git a/meson.build b/meson.build index dac03d92..a8af39f3 100644 --- a/meson.build +++ b/meson.build @@ -52,6 +52,7 @@ variant_sources = ['src/lib/zxc_compress.c', variant_sets = [['_default', []]] if host_cpu == 'x86_64' + variant_sets += [['_sse2', ['-msse2']]] variant_sets += [['_avx2', ['-mavx2', '-mfma', '-mbmi', '-mbmi2', '-mlzcnt']]] variant_sets += [['_avx512', ['-mavx512f', '-mavx512bw', '-mbmi', '-mbmi2', '-mlzcnt']]] elif host_cpu == 'aarch64' diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 29f6a035..694d07d6 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -76,6 +76,66 @@ static ZXC_ALWAYS_INLINE uint32_t zxc_mm256_reduce_max_epu32(__m256i v) { } #endif +#if defined(ZXC_USE_SSE2) +/** + * @brief SSE2 emulation of SSE4.1 @c _mm_max_epu32 (element-wise unsigned max). + * + * SSE2 has no unsigned 32-bit compare, so we map unsigned ordering to signed + * ordering by flipping the sign bit (XOR 0x80000000) before @c _mm_cmpgt_epi32, + * then select element-wise. + */ +// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined +static ZXC_ALWAYS_INLINE __m128i zxc_mm_max_epu32_sse2(__m128i a, __m128i b) { + const __m128i bias = _mm_set1_epi32((int)0x80000000); + const __m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); + return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b)); +} + +/** + * @brief SSE2 emulation of SSE4.1 @c _mm_blendv_epi8. + * + * Selects bytes from @p b where the corresponding @p mask byte has its high bit + * set, else from @p a. In every call site the mask lanes are full-width compare + * results (all-ones or all-zero per element), so a plain bitwise select is exact. + */ +// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined +static ZXC_ALWAYS_INLINE __m128i zxc_mm_blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) { + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +/** + * @brief SSE2 emulation of SSE4.1 @c _mm_packus_epi32 (saturating u32 -> u16). + * + * SSE2 only has signed @c _mm_packs_epi32 (saturates to int16). Bias each lane + * by -0x8000 so values in [0, 0xFFFF] land in the signed int16 range with no + * saturation, pack, then add 0x8000 back per 16-bit lane. Exact for inputs in + * [0, 0xFFFF] (all call sites pass match lengths < 2^16). + */ +// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined +static ZXC_ALWAYS_INLINE __m128i zxc_mm_packus_epi32_sse2(__m128i a, __m128i b) { + const __m128i bias32 = _mm_set1_epi32(0x8000); + const __m128i bias16 = _mm_set1_epi16((short)0x8000); + const __m128i pa = _mm_sub_epi32(a, bias32); + const __m128i pb = _mm_sub_epi32(b, bias32); + return _mm_add_epi16(_mm_packs_epi32(pa, pb), bias16); +} + +/** + * @brief Horizontal maximum of four packed unsigned 32-bit integers (SSE2). + * + * @param[in] v The 128-bit vector containing 4 unsigned 32-bit integers. + * @return The maximum unsigned 32-bit integer found in the vector. + */ +// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined +static ZXC_ALWAYS_INLINE uint32_t zxc_mm_reduce_max_epu32(__m128i v) { + __m128i vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); // Swap 64-bit halves + v = zxc_mm_max_epu32_sse2(v, vshuf); // Max of pairs + vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)); // Swap adjacent lanes + v = zxc_mm_max_epu32_sse2(v, vshuf); // Final max + return (uint32_t)_mm_cvtsi128_si32(v); // Extract scalar result +} +#endif + /** * @brief Writes a Prefix Varint encoded value to a buffer. * @@ -315,6 +375,22 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match( goto _match_len_done; } } +#elif defined(ZXC_USE_SSE2) + const uint8_t* limit_16 = iend - 16; + while (ip + mlen < limit_16) { + const __m128i v_src = _mm_loadu_si128((const __m128i*)(ip + mlen)); + const __m128i v_ref = _mm_loadu_si128((const __m128i*)(ref + mlen)); + const __m128i v_cmp = _mm_cmpeq_epi8(v_src, v_ref); + const uint32_t mask = (uint32_t)_mm_movemask_epi8(v_cmp); + if (mask == 0xFFFFU) + mlen += 16; + else { + // mask != 0xFFFF => a differing byte exists in bits 0..15, + // so the lowest set bit of ~mask lies in that range. + mlen += zxc_ctz32(~mask); + goto _match_len_done; + } + } #elif defined(ZXC_USE_NEON64) { const uint8_t* limit_32 = iend - 32; @@ -669,11 +745,37 @@ static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* R max_d = vgetq_lane_u32(v_max1, 0); #endif + if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t)); + } +#elif defined(ZXC_USE_SSE2) + // SSE2 processes 128-bit vectors (4 uint32 integers) + if (frames >= 4) { + __m128i v_max_accum = _mm_setzero_si128(); // Initialize max accumulator to 0 + + for (; j < (frames & ~3); j += 4) { + if (UNLIKELY(i == 0 && j == 0)) goto _scalar; + + // Load 4 consecutive integers and the same window offset by -1 + const __m128i vc = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4)); + const __m128i vp = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4 - 4)); + + const __m128i diff = _mm_sub_epi32(vc, vp); // Compute deltas: curr - prev + + // ZigZag encode: (diff << 1) ^ (diff >> 31) + const __m128i zigzag = + _mm_xor_si128(_mm_slli_epi32(diff, 1), _mm_srai_epi32(diff, 31)); + _mm_storeu_si128((__m128i*)&deltas[j], zigzag); // Store results + // SSE2 has no unsigned max; use the SSE2 emulation. + v_max_accum = zxc_mm_max_epu32_sse2(v_max_accum, zigzag); // Update max accumulator + } + + max_d = zxc_mm_reduce_max_epu32(v_max_accum); // Horizontal max reduction + if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t)); } #endif #if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_NEON64) || \ - defined(ZXC_USE_NEON32) + defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2) _scalar: #endif for (; j < frames; j++) { @@ -823,6 +925,34 @@ static ZXC_ALWAYS_INLINE size_t zxc_opt_dp_update_const_cost( vst1_u16(&parent_off[p + L], vbsl_u16(v_mask16, v_off, v_po)); } } +#elif defined(ZXC_USE_SSE2) + if (L + 4 <= L_end) { + const __m128i v_inc = _mm_setr_epi32(0, 1, 2, 3); + const __m128i v_nxt = _mm_set1_epi32((int)nxt); + const __m128i v_bias = _mm_set1_epi32((int)0x80000000); + const __m128i v_nxt_b = _mm_xor_si128(v_nxt, v_bias); + const __m128i v_off = _mm_set1_epi16((short)off_biased); + for (; L + 4 <= L_end; L += 4) { + const __m128i v_L_lanes = _mm_add_epi32(v_inc, _mm_set1_epi32((int)L)); + const __m128i v_dp = _mm_loadu_si128((const __m128i*)&dp[p + L]); + /* Unsigned compare via sign-bit bias (SSE2 cmpgt is signed only): + * (dp ^ 0x80000000) > (nxt ^ 0x80000000) iff dp > nxt. */ + const __m128i v_dp_b = _mm_xor_si128(v_dp, v_bias); + const __m128i v_mask = _mm_cmpgt_epi32(v_dp_b, v_nxt_b); + const __m128i v_dp_new = zxc_mm_blendv_epi8_sse2(v_dp, v_nxt, v_mask); + _mm_storeu_si128((__m128i*)&dp[p + L], v_dp_new); + /* Narrow the 4x int32 mask / length lanes to 4x int16 (low 64 bits). + * packs: 0xFFFFFFFF -> 0xFFFF, 0 -> 0; packus (SSE2-emulated): u32->u16. */ + const __m128i v_mask16 = _mm_packs_epi32(v_mask, v_mask); + const __m128i v_L_u16 = zxc_mm_packus_epi32_sse2(v_L_lanes, v_L_lanes); + __m128i v_pl = _mm_loadl_epi64((const __m128i*)&parent_len[p + L]); + v_pl = zxc_mm_blendv_epi8_sse2(v_pl, v_L_u16, v_mask16); + _mm_storel_epi64((__m128i*)&parent_len[p + L], v_pl); + __m128i v_po = _mm_loadl_epi64((const __m128i*)&parent_off[p + L]); + v_po = zxc_mm_blendv_epi8_sse2(v_po, v_off, v_mask16); + _mm_storel_epi64((__m128i*)&parent_off[p + L], v_po); + } + } #endif /* Scalar tail (and full path on archs without SIMD). * L < L_end <= UINT16_MAX (caller precondition), so the cast is lossless. */ @@ -1398,6 +1528,17 @@ parse_done:; } p += 32; } +#elif defined(ZXC_USE_SSE2) + const __m128i vb = _mm_set1_epi8((char)b); + while (p <= p_end - 16) { + const __m128i v = _mm_loadu_si128((const __m128i*)p); + const uint32_t mask = (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vb)); + if (mask != 0xFFFFU) { + p += zxc_ctz32(~mask); + goto _run_done; + } + p += 16; + } #elif defined(ZXC_USE_NEON64) const uint8x16_t vb = vdupq_n_u8(b); while (p <= p_end - 16) { @@ -1446,7 +1587,7 @@ parse_done:; while (p < p_end && *p == b) p++; #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \ - defined(ZXC_USE_NEON32) + defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2) _run_done:; #endif const size_t run = (size_t)(p - run_start); @@ -1497,6 +1638,22 @@ parse_done:; } p += 32; } +#elif defined(ZXC_USE_SSE2) + while (p <= p_end_4 - 16) { + __m128i v0 = _mm_loadu_si128((const __m128i*)p); + __m128i v1 = _mm_loadu_si128((const __m128i*)(p + 1)); + __m128i v2 = _mm_loadu_si128((const __m128i*)(p + 2)); + __m128i v3 = _mm_loadu_si128((const __m128i*)(p + 3)); + __m128i vend = _mm_and_si128( + _mm_cmpeq_epi8(v0, v1), + _mm_and_si128(_mm_cmpeq_epi8(v1, v2), _mm_cmpeq_epi8(v2, v3))); + uint32_t mask = (uint32_t)_mm_movemask_epi8(vend); + if (mask != 0) { + p += zxc_ctz32(mask); + goto _lit_done; + } + p += 16; + } #elif defined(ZXC_USE_NEON64) while (p <= p_end_4 - 16) { uint8x16_t v0 = vld1q_u8(p); @@ -1561,7 +1718,7 @@ parse_done:; } #if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \ - defined(ZXC_USE_NEON32) + defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2) _lit_done:; #endif const size_t lit_run = (size_t)(p - lit_start); diff --git a/src/lib/zxc_decompress.c b/src/lib/zxc_decompress.c index 8f47a24e..ed5af86d 100644 --- a/src/lib/zxc_decompress.c +++ b/src/lib/zxc_decompress.c @@ -195,11 +195,13 @@ static ZXC_ALWAYS_INLINE void zxc_copy_overlap16(uint8_t* dst, uint32_t off) { vst1_u8(dst + 8, vtbl2_u8(src_tbl, mask_hi)); #elif defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) + // SSSE3 PSHUFB (implied by AVX2/AVX512): single 16-byte table-lookup replicate. __m128i mask = _mm_load_si128((const __m128i*)zxc_overlap_masks[off]); __m128i src_data = _mm_loadu_si128((const __m128i*)(dst - off)); _mm_storeu_si128((__m128i*)dst, _mm_shuffle_epi8(src_data, mask)); #else + // SSE2-only tier and non-SIMD builds: scalar replicate (no PSHUFB). const uint8_t* src = dst - off; for (size_t i = 0; i < 16; i++) { dst[i] = src[i % off]; @@ -308,6 +310,24 @@ static ZXC_ALWAYS_INLINE __m512i zxc_mm512_prefix_sum_epi32(__m512i v) { } #endif +#if defined(ZXC_USE_SSE2) +/** + * @brief Computes the prefix sum of a 128-bit vector of four 32-bit integers + * using SSE2 byte-shift adds (the 128-bit analogue of the NEON helper). + * + * For input [a, b, c, d] the result is [a, a+b, a+b+c, a+b+c+d]. + * + * @param[in] v Input vector of four 32-bit integers. + * @return Vector containing the prefix sums. + */ +// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined +static ZXC_ALWAYS_INLINE __m128i zxc_mm_prefix_sum_epi32(__m128i v) { + v = _mm_add_epi32(v, _mm_slli_si128(v, 4)); // [a, a+b, b+c, c+d] + v = _mm_add_epi32(v, _mm_slli_si128(v, 8)); // [a, a+b, a+b+c, a+b+c+d] + return v; +} +#endif + /** * @brief Decodes a block of numerical data compressed with the ZXC format. * @@ -442,6 +462,22 @@ static int zxc_decode_block_num(const uint8_t* RESTRICT src, const size_t src_si running_val = vgetq_lane_u32(v_run, 0); // Extract once at the end of the batch #endif +#elif defined(ZXC_USE_SSE2) + __m128i v_run = _mm_set1_epi32((int)running_val); // Broadcast running total + for (int k = 0; k < ZXC_NUM_DEC_BATCH; k += 4) { + __m128i v_deltas = _mm_load_si128((const __m128i*)&deltas[k]); // Load 4 deltas + + __m128i v_sum = zxc_mm_prefix_sum_epi32(v_deltas); // Local prefix sums + v_sum = _mm_add_epi32(v_sum, v_run); // Add base running total + + _mm_storeu_si128((__m128i*)&batch_dst[k], v_sum); // Store decoded values + + // Broadcast 4th element (lane 3) to all lanes for the next iteration. + v_run = _mm_shuffle_epi32(v_sum, _MM_SHUFFLE(3, 3, 3, 3)); + } + // Extract final running_val back to GPR for the scalar tail. + running_val = (uint32_t)_mm_cvtsi128_si32(v_run); + #else for (int k = 0; k < ZXC_NUM_DEC_BATCH; k++) { running_val += deltas[k]; diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 6526bfd9..d37c31e7 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -65,6 +65,12 @@ int zxc_decompress_chunk_wrapper_safe_avx2(zxc_cctx_t* RESTRICT ctx, const uint8 int zxc_decompress_chunk_wrapper_safe_avx512(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap); +int zxc_decompress_chunk_wrapper_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, + const size_t src_sz, uint8_t* RESTRICT dst, + const size_t dst_cap); +int zxc_decompress_chunk_wrapper_safe_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, + const size_t src_sz, uint8_t* RESTRICT dst, + const size_t dst_cap); #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) int zxc_decompress_chunk_wrapper_neon(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, @@ -99,6 +105,9 @@ int zxc_compress_chunk_wrapper_avx2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RES int zxc_compress_chunk_wrapper_avx512(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap); +int zxc_compress_chunk_wrapper_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, + const size_t src_sz, uint8_t* RESTRICT dst, + const size_t dst_cap); #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) int zxc_compress_chunk_wrapper_neon(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, @@ -119,7 +128,8 @@ typedef enum { ZXC_CPU_GENERIC = 0, /**< @brief Scalar-only fallback. */ ZXC_CPU_AVX2 = 1, /**< @brief x86-64 AVX2 available. */ ZXC_CPU_AVX512 = 2, /**< @brief x86-64 AVX-512F+BW available. */ - ZXC_CPU_NEON = 3 /**< @brief ARM NEON available. */ + ZXC_CPU_NEON = 3, /**< @brief ARM NEON available. */ + ZXC_CPU_SSE2 = 4 /**< @brief x86 SSE2 available (no AVX2); x86-64 baseline. */ } zxc_cpu_feature_t; /** @@ -140,15 +150,17 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) { #if defined(__x86_64__) || defined(_M_X64) #if defined(_MSC_VER) // MSVC detection using __cpuid - // Function ID 1: EAX=1. ECX: Bit 28=AVX. + // Function ID 1: EAX=1. EDX: Bit 26=SSE2. ECX: Bit 28=AVX. // Function ID 7: EAX=7, ECX=0. EBX: Bit 5=AVX2, Bit 16=AVX512F, Bit 30=AVX512BW. int regs[4]; + int sse2 = 0; int avx = 0; int avx2 = 0; int avx512 = 0; __cpuid(regs, 1); - if (regs[2] & (1 << 28)) avx = 1; + if (regs[3] & (1 << 26)) sse2 = 1; // EDX bit 26 = SSE2 + if (regs[2] & (1 << 28)) avx = 1; // ECX bit 28 = AVX if (avx) { __cpuidex(regs, 7, 0); @@ -160,6 +172,8 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) { features = ZXC_CPU_AVX512; } else if (avx2) { features = ZXC_CPU_AVX2; + } else if (sse2) { + features = ZXC_CPU_SSE2; } #else // GCC/Clang built-in detection @@ -169,6 +183,8 @@ static zxc_cpu_feature_t zxc_detect_cpu_features(void) { features = ZXC_CPU_AVX512; } else if (__builtin_cpu_supports("avx2")) { features = ZXC_CPU_AVX2; + } else if (__builtin_cpu_supports("sse2")) { + features = ZXC_CPU_SSE2; } #endif @@ -238,6 +254,8 @@ static int zxc_decompress_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uint8_t* zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_avx512; else if (cpu == ZXC_CPU_AVX2) zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_avx2; + else if (cpu == ZXC_CPU_SSE2) + zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_sse2; else zxc_decompress_ptr_local = zxc_decompress_chunk_wrapper_default; #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) @@ -283,6 +301,8 @@ static int zxc_decompress_safe_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uin zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_avx512; else if (cpu == ZXC_CPU_AVX2) zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_avx2; + else if (cpu == ZXC_CPU_SSE2) + zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_sse2; else zxc_decompress_safe_ptr_local = zxc_decompress_chunk_wrapper_safe_default; #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) @@ -329,6 +349,8 @@ static int zxc_compress_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uint8_t* R zxc_compress_ptr_local = zxc_compress_chunk_wrapper_avx512; else if (cpu == ZXC_CPU_AVX2) zxc_compress_ptr_local = zxc_compress_chunk_wrapper_avx2; + else if (cpu == ZXC_CPU_SSE2) + zxc_compress_ptr_local = zxc_compress_chunk_wrapper_sse2; else zxc_compress_ptr_local = zxc_compress_chunk_wrapper_default; #elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM) diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index c753c2c0..9b5c1255 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -72,9 +72,20 @@ extern "C" { * may be defined: * - @c ZXC_USE_AVX512 - AVX-512F + AVX-512BW available. * - @c ZXC_USE_AVX2 - AVX2 available. + * - @c ZXC_USE_SSE2 - SSE2 (x86-64 baseline) available. * - @c ZXC_USE_NEON64 - AArch64 NEON available. * - @c ZXC_USE_NEON32 - ARMv7 NEON available. * + * Note: @c -mavx2 / @c -mavx512f imply @c __SSE2__, so @c ZXC_USE_SSE2 is + * also defined in the AVX variants. The hand-written SIMD code paths therefore + * order their preprocessor branches AVX512 -> AVX2 -> SSE2 so the widest + * available path wins; the SSE2 branch is the active one only in the dedicated + * @c _sse2 variant (no AVX2/AVX512 flags). SSE2 is the x86-64 baseline, so this + * tier covers every 64-bit x86 CPU (and i686 with @c -msse2). The handful of + * operations that would otherwise require SSE4.1 (@c _mm_max_epu32, + * @c _mm_blendv_epi8, @c _mm_packus_epi32) or SSSE3 (@c _mm_shuffle_epi8) are + * emulated with pure SSE2 instruction sequences or fall back to scalar code. + * * Define @c ZXC_DISABLE_SIMD to gate all hand-written SIMD paths (intrinsics, * inline assembly). Compiler auto-vectorisation is unaffected. * @{ @@ -93,6 +104,11 @@ extern "C" { #define ZXC_USE_AVX2 #endif #endif +#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#ifndef ZXC_USE_SSE2 +#define ZXC_USE_SSE2 +#endif +#endif #elif (defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) || \ defined(ZXC_USE_NEON32) || defined(ZXC_USE_NEON64)) #if !defined(_MSC_VER) @@ -1008,8 +1024,8 @@ static ZXC_ALWAYS_INLINE uint16_t zxc_hash16(const uint8_t* p) { * @param[in] src Pointer to the source memory block. */ static ZXC_ALWAYS_INLINE void zxc_copy16(void* dst, const void* src) { -#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) - // AVX2/AVX512: Single 128-bit unaligned load/store +#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_SSE2) + // x86 SSE2/AVX2/AVX512: Single 128-bit unaligned load/store _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); #elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32) vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); @@ -1030,6 +1046,11 @@ static ZXC_ALWAYS_INLINE void zxc_copy32(void* dst, const void* src) { #if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) // AVX2/AVX512: Single 256-bit (32 byte) unaligned load/store _mm256_storeu_si256((__m256i*)dst, _mm256_loadu_si256((const __m256i*)src)); +#elif defined(ZXC_USE_SSE2) + // SSE2: Two 128-bit (16 byte) unaligned load/stores (no 256-bit regs) + _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); + _mm_storeu_si128((__m128i*)((uint8_t*)dst + 16), + _mm_loadu_si128((const __m128i*)((const uint8_t*)src + 16))); #elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32) // NEON: Two 128-bit (16 byte) unaligned load/stores vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); diff --git a/wrappers/rust/zxc-sys/build.rs b/wrappers/rust/zxc-sys/build.rs index 24d8a4a9..b3530f7f 100644 --- a/wrappers/rust/zxc-sys/build.rs +++ b/wrappers/rust/zxc-sys/build.rs @@ -11,7 +11,7 @@ //! to support runtime CPU feature detection and optimized code paths. //! //! On ARM64: Compiles `_default` and `_neon` variants -//! On x86_64: Compiles `_default`, `_avx2`, and `_avx512` variants +//! On x86_64: Compiles `_default`, `_sse2`, `_avx2`, and `_avx512` variants use std::env; use std::fs; @@ -217,7 +217,7 @@ fn main() { if is_arm64 { core_build.flag_if_supported("-march=armv8-a+crc"); } else if is_x86_64 { - core_build.flag_if_supported("-msse4.2"); + core_build.flag_if_supported("-msse2"); core_build.flag_if_supported("-mpclmul"); } @@ -271,6 +271,48 @@ fn main() { neon_decompress.compile("zxc_decompress_neon"); neon_huffman.compile("zxc_huffman_neon"); } else if is_x86_64 { + // SSE2 variant: the x86-64 baseline (also covers any i686 built with + // SSE2). Mirrors the _sse2 variant in CMakeLists.txt / meson.build. + let mut sse2_compress = cc::Build::new(); + sse2_compress + .include(&include_dir) + .include(&src_lib) + .include(src_lib.join("vendors")) + .define("ZXC_STATIC_DEFINE", None) + .file(src_lib.join("zxc_compress.c")) + .define("ZXC_FUNCTION_SUFFIX", "_sse2") + .flag_if_supported("-msse2") + .opt_level(3) + .warnings(false); + + let mut sse2_decompress = cc::Build::new(); + sse2_decompress + .include(&include_dir) + .include(&src_lib) + .include(src_lib.join("vendors")) + .define("ZXC_STATIC_DEFINE", None) + .file(src_lib.join("zxc_decompress.c")) + .define("ZXC_FUNCTION_SUFFIX", "_sse2") + .flag_if_supported("-msse2") + .opt_level(3) + .warnings(false); + + let mut sse2_huffman = cc::Build::new(); + sse2_huffman + .include(&include_dir) + .include(&src_lib) + .include(src_lib.join("vendors")) + .define("ZXC_STATIC_DEFINE", None) + .file(src_lib.join("zxc_huffman.c")) + .define("ZXC_FUNCTION_SUFFIX", "_sse2") + .flag_if_supported("-msse2") + .opt_level(3) + .warnings(false); + + sse2_compress.compile("zxc_compress_sse2"); + sse2_decompress.compile("zxc_decompress_sse2"); + sse2_huffman.compile("zxc_huffman_sse2"); + // AVX2 variant let mut avx2_compress = cc::Build::new(); avx2_compress