Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ jobs:

- name: Clone LZbench
run: |
git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
# git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}"
git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}"

- name: Copy Lib ZXC
run: |
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/multiarch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ jobs:
cross_prefix: x86_64-linux-gnu
cmake_processor: x86_64
qemu_binary: qemu-x86_64
- name: Linux amd64 (SSE2)
cross_pkg: gcc
cross_prefix: x86_64-linux-gnu
cmake_processor: x86_64
qemu_binary: qemu-x86_64
qemu_cpu: core2duo
- name: Linux amd64 (AVX2)
cross_pkg: gcc
cross_prefix: x86_64-linux-gnu
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,15 @@ else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
message(STATUS "Building x86_64 AVX2 and AVX512 variants...")
if(MSVC)
# SSE2 for MSVC: SSE2 is the x86-64 baseline
zxc_add_variant(_sse2 "/D__SSE2__")
# AVX2 for MSVC (Enables AVX2/BMI1/BMI2 sets)
zxc_add_variant(_avx2 "/arch:AVX2;/D__BMI__;/D__BMI2__;/D__LZCNT__")
# AVX512 for MSVC (VS2019 16.10+ supports /arch:AVX512)
zxc_add_variant(_avx512 "/arch:AVX512;/D__BMI__;/D__BMI2__;/D__LZCNT__")
else()
# SSE2 for GCC/Clang (x86-64 baseline)
zxc_add_variant(_sse2 "-msse2")
# AVX2 for GCC/Clang
zxc_add_variant(_avx2 "-mavx2;-mfma;-mbmi;-mbmi2;-mlzcnt")
# AVX512 for GCC/Clang
Expand Down
1 change: 1 addition & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ variant_sources = ['src/lib/zxc_compress.c',
variant_sets = [['_default', []]]

if host_cpu == 'x86_64'
variant_sets += [['_sse2', ['-msse2']]]
variant_sets += [['_avx2', ['-mavx2', '-mfma', '-mbmi', '-mbmi2', '-mlzcnt']]]
variant_sets += [['_avx512', ['-mavx512f', '-mavx512bw', '-mbmi', '-mbmi2', '-mlzcnt']]]
elif host_cpu == 'aarch64'
Expand Down
163 changes: 160 additions & 3 deletions src/lib/zxc_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,66 @@ static ZXC_ALWAYS_INLINE uint32_t zxc_mm256_reduce_max_epu32(__m256i v) {
}
#endif

#if defined(ZXC_USE_SSE2)
/**
* @brief SSE2 emulation of SSE4.1 @c _mm_max_epu32 (element-wise unsigned max).
*
* SSE2 has no unsigned 32-bit compare, so we map unsigned ordering to signed
* ordering by flipping the sign bit (XOR 0x80000000) before @c _mm_cmpgt_epi32,
* then select element-wise.
*/
// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
static ZXC_ALWAYS_INLINE __m128i zxc_mm_max_epu32_sse2(__m128i a, __m128i b) {
const __m128i bias = _mm_set1_epi32((int)0x80000000);
const __m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
}

/**
* @brief SSE2 emulation of SSE4.1 @c _mm_blendv_epi8.
*
* Selects bytes from @p b where the corresponding @p mask byte has its high bit
* set, else from @p a. In every call site the mask lanes are full-width compare
* results (all-ones or all-zero per element), so a plain bitwise select is exact.
*/
// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
static ZXC_ALWAYS_INLINE __m128i zxc_mm_blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

/**
* @brief SSE2 emulation of SSE4.1 @c _mm_packus_epi32 (saturating u32 -> u16).
*
* SSE2 only has signed @c _mm_packs_epi32 (saturates to int16). Bias each lane
* by -0x8000 so values in [0, 0xFFFF] land in the signed int16 range with no
* saturation, pack, then add 0x8000 back per 16-bit lane. Exact for inputs in
* [0, 0xFFFF] (all call sites pass match lengths < 2^16).
*/
// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
static ZXC_ALWAYS_INLINE __m128i zxc_mm_packus_epi32_sse2(__m128i a, __m128i b) {
const __m128i bias32 = _mm_set1_epi32(0x8000);
const __m128i bias16 = _mm_set1_epi16((short)0x8000);
const __m128i pa = _mm_sub_epi32(a, bias32);
const __m128i pb = _mm_sub_epi32(b, bias32);
return _mm_add_epi16(_mm_packs_epi32(pa, pb), bias16);
}

/**
* @brief Horizontal maximum of four packed unsigned 32-bit integers (SSE2).
*
* @param[in] v The 128-bit vector containing 4 unsigned 32-bit integers.
* @return The maximum unsigned 32-bit integer found in the vector.
*/
// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
static ZXC_ALWAYS_INLINE uint32_t zxc_mm_reduce_max_epu32(__m128i v) {
__m128i vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); // Swap 64-bit halves
v = zxc_mm_max_epu32_sse2(v, vshuf); // Max of pairs
vshuf = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)); // Swap adjacent lanes
v = zxc_mm_max_epu32_sse2(v, vshuf); // Final max
return (uint32_t)_mm_cvtsi128_si32(v); // Extract scalar result
}
#endif

/**
* @brief Writes a Prefix Varint encoded value to a buffer.
*
Expand Down Expand Up @@ -315,6 +375,22 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match(
goto _match_len_done;
}
}
#elif defined(ZXC_USE_SSE2)
const uint8_t* limit_16 = iend - 16;
while (ip + mlen < limit_16) {
const __m128i v_src = _mm_loadu_si128((const __m128i*)(ip + mlen));
const __m128i v_ref = _mm_loadu_si128((const __m128i*)(ref + mlen));
const __m128i v_cmp = _mm_cmpeq_epi8(v_src, v_ref);
const uint32_t mask = (uint32_t)_mm_movemask_epi8(v_cmp);
if (mask == 0xFFFFU)
mlen += 16;
else {
// mask != 0xFFFF => a differing byte exists in bits 0..15,
// so the lowest set bit of ~mask lies in that range.
mlen += zxc_ctz32(~mask);
goto _match_len_done;
}
}
#elif defined(ZXC_USE_NEON64)
{
const uint8_t* limit_32 = iend - 32;
Expand Down Expand Up @@ -669,11 +745,37 @@ static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* R
max_d = vgetq_lane_u32(v_max1, 0);
#endif

if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
}
#elif defined(ZXC_USE_SSE2)
// SSE2 processes 128-bit vectors (4 uint32 integers)
if (frames >= 4) {
__m128i v_max_accum = _mm_setzero_si128(); // Initialize max accumulator to 0

for (; j < (frames & ~3); j += 4) {
if (UNLIKELY(i == 0 && j == 0)) goto _scalar;

// Load 4 consecutive integers and the same window offset by -1
const __m128i vc = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4));
const __m128i vp = _mm_loadu_si128((const __m128i*)(in_ptr + j * 4 - 4));

const __m128i diff = _mm_sub_epi32(vc, vp); // Compute deltas: curr - prev

// ZigZag encode: (diff << 1) ^ (diff >> 31)
const __m128i zigzag =
_mm_xor_si128(_mm_slli_epi32(diff, 1), _mm_srai_epi32(diff, 31));
_mm_storeu_si128((__m128i*)&deltas[j], zigzag); // Store results
// SSE2 has no unsigned max; use the SSE2 emulation.
v_max_accum = zxc_mm_max_epu32_sse2(v_max_accum, zigzag); // Update max accumulator
}

max_d = zxc_mm_reduce_max_epu32(v_max_accum); // Horizontal max reduction

if (j > 0) prev = zxc_le32(in_ptr + (j - 1) * sizeof(uint32_t));
}
#endif
#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_NEON64) || \
defined(ZXC_USE_NEON32)
defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
_scalar:
#endif
for (; j < frames; j++) {
Expand Down Expand Up @@ -823,6 +925,34 @@ static ZXC_ALWAYS_INLINE size_t zxc_opt_dp_update_const_cost(
vst1_u16(&parent_off[p + L], vbsl_u16(v_mask16, v_off, v_po));
}
}
#elif defined(ZXC_USE_SSE2)
if (L + 4 <= L_end) {
const __m128i v_inc = _mm_setr_epi32(0, 1, 2, 3);
const __m128i v_nxt = _mm_set1_epi32((int)nxt);
const __m128i v_bias = _mm_set1_epi32((int)0x80000000);
const __m128i v_nxt_b = _mm_xor_si128(v_nxt, v_bias);
const __m128i v_off = _mm_set1_epi16((short)off_biased);
for (; L + 4 <= L_end; L += 4) {
const __m128i v_L_lanes = _mm_add_epi32(v_inc, _mm_set1_epi32((int)L));
const __m128i v_dp = _mm_loadu_si128((const __m128i*)&dp[p + L]);
/* Unsigned compare via sign-bit bias (SSE2 cmpgt is signed only):
* (dp ^ 0x80000000) > (nxt ^ 0x80000000) iff dp > nxt. */
const __m128i v_dp_b = _mm_xor_si128(v_dp, v_bias);
const __m128i v_mask = _mm_cmpgt_epi32(v_dp_b, v_nxt_b);
const __m128i v_dp_new = zxc_mm_blendv_epi8_sse2(v_dp, v_nxt, v_mask);
_mm_storeu_si128((__m128i*)&dp[p + L], v_dp_new);
/* Narrow the 4x int32 mask / length lanes to 4x int16 (low 64 bits).
* packs: 0xFFFFFFFF -> 0xFFFF, 0 -> 0; packus (SSE2-emulated): u32->u16. */
const __m128i v_mask16 = _mm_packs_epi32(v_mask, v_mask);
const __m128i v_L_u16 = zxc_mm_packus_epi32_sse2(v_L_lanes, v_L_lanes);
__m128i v_pl = _mm_loadl_epi64((const __m128i*)&parent_len[p + L]);
v_pl = zxc_mm_blendv_epi8_sse2(v_pl, v_L_u16, v_mask16);
_mm_storel_epi64((__m128i*)&parent_len[p + L], v_pl);
__m128i v_po = _mm_loadl_epi64((const __m128i*)&parent_off[p + L]);
v_po = zxc_mm_blendv_epi8_sse2(v_po, v_off, v_mask16);
_mm_storel_epi64((__m128i*)&parent_off[p + L], v_po);
}
}
#endif
/* Scalar tail (and full path on archs without SIMD).
* L < L_end <= UINT16_MAX (caller precondition), so the cast is lossless. */
Expand Down Expand Up @@ -1398,6 +1528,17 @@ parse_done:;
}
p += 32;
}
#elif defined(ZXC_USE_SSE2)
const __m128i vb = _mm_set1_epi8((char)b);
while (p <= p_end - 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)p);
const uint32_t mask = (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vb));
if (mask != 0xFFFFU) {
p += zxc_ctz32(~mask);
goto _run_done;
}
p += 16;
}
#elif defined(ZXC_USE_NEON64)
const uint8x16_t vb = vdupq_n_u8(b);
while (p <= p_end - 16) {
Expand Down Expand Up @@ -1446,7 +1587,7 @@ parse_done:;
while (p < p_end && *p == b) p++;

#if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
defined(ZXC_USE_NEON32)
defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
_run_done:;
#endif
const size_t run = (size_t)(p - run_start);
Expand Down Expand Up @@ -1497,6 +1638,22 @@ parse_done:;
}
p += 32;
}
#elif defined(ZXC_USE_SSE2)
while (p <= p_end_4 - 16) {
__m128i v0 = _mm_loadu_si128((const __m128i*)p);
__m128i v1 = _mm_loadu_si128((const __m128i*)(p + 1));
__m128i v2 = _mm_loadu_si128((const __m128i*)(p + 2));
__m128i v3 = _mm_loadu_si128((const __m128i*)(p + 3));
__m128i vend = _mm_and_si128(
_mm_cmpeq_epi8(v0, v1),
_mm_and_si128(_mm_cmpeq_epi8(v1, v2), _mm_cmpeq_epi8(v2, v3)));
uint32_t mask = (uint32_t)_mm_movemask_epi8(vend);
if (mask != 0) {
p += zxc_ctz32(mask);
goto _lit_done;
}
p += 16;
}
#elif defined(ZXC_USE_NEON64)
while (p <= p_end_4 - 16) {
uint8x16_t v0 = vld1q_u8(p);
Expand Down Expand Up @@ -1561,7 +1718,7 @@ parse_done:;
}

#if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
defined(ZXC_USE_NEON32)
defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
_lit_done:;
#endif
const size_t lit_run = (size_t)(p - lit_start);
Expand Down
36 changes: 36 additions & 0 deletions src/lib/zxc_decompress.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,13 @@ static ZXC_ALWAYS_INLINE void zxc_copy_overlap16(uint8_t* dst, uint32_t off) {
vst1_u8(dst + 8, vtbl2_u8(src_tbl, mask_hi));

#elif defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
// SSSE3 PSHUFB (implied by AVX2/AVX512): single 16-byte table-lookup replicate.
__m128i mask = _mm_load_si128((const __m128i*)zxc_overlap_masks[off]);
__m128i src_data = _mm_loadu_si128((const __m128i*)(dst - off));
_mm_storeu_si128((__m128i*)dst, _mm_shuffle_epi8(src_data, mask));

#else
// SSE2-only tier and non-SIMD builds: scalar replicate (no PSHUFB).
const uint8_t* src = dst - off;
for (size_t i = 0; i < 16; i++) {
dst[i] = src[i % off];
Expand Down Expand Up @@ -308,6 +310,24 @@ static ZXC_ALWAYS_INLINE __m512i zxc_mm512_prefix_sum_epi32(__m512i v) {
}
#endif

#if defined(ZXC_USE_SSE2)
/**
* @brief Computes the prefix sum of a 128-bit vector of four 32-bit integers
* using SSE2 byte-shift adds (the 128-bit analogue of the NEON helper).
*
* For input [a, b, c, d] the result is [a, a+b, a+b+c, a+b+c+d].
*
* @param[in] v Input vector of four 32-bit integers.
* @return Vector containing the prefix sums.
*/
// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
static ZXC_ALWAYS_INLINE __m128i zxc_mm_prefix_sum_epi32(__m128i v) {
v = _mm_add_epi32(v, _mm_slli_si128(v, 4)); // [a, a+b, b+c, c+d]
v = _mm_add_epi32(v, _mm_slli_si128(v, 8)); // [a, a+b, a+b+c, a+b+c+d]
return v;
}
#endif

/**
* @brief Decodes a block of numerical data compressed with the ZXC format.
*
Expand Down Expand Up @@ -442,6 +462,22 @@ static int zxc_decode_block_num(const uint8_t* RESTRICT src, const size_t src_si
running_val = vgetq_lane_u32(v_run, 0); // Extract once at the end of the batch
#endif

#elif defined(ZXC_USE_SSE2)
__m128i v_run = _mm_set1_epi32((int)running_val); // Broadcast running total
for (int k = 0; k < ZXC_NUM_DEC_BATCH; k += 4) {
__m128i v_deltas = _mm_load_si128((const __m128i*)&deltas[k]); // Load 4 deltas

__m128i v_sum = zxc_mm_prefix_sum_epi32(v_deltas); // Local prefix sums
v_sum = _mm_add_epi32(v_sum, v_run); // Add base running total

_mm_storeu_si128((__m128i*)&batch_dst[k], v_sum); // Store decoded values

// Broadcast 4th element (lane 3) to all lanes for the next iteration.
v_run = _mm_shuffle_epi32(v_sum, _MM_SHUFFLE(3, 3, 3, 3));
}
// Extract final running_val back to GPR for the scalar tail.
running_val = (uint32_t)_mm_cvtsi128_si32(v_run);

#else
for (int k = 0; k < ZXC_NUM_DEC_BATCH; k++) {
running_val += deltas[k];
Expand Down
Loading
Loading