From ce17d63f2c5656ff7fc4404d76150edb0b3ded86 Mon Sep 17 00:00:00 2001 From: mperikov Date: Tue, 16 Dec 2025 16:38:31 +0300 Subject: [PATCH 1/9] lower_bound_zeros_* --- include/bits.h | 101 ++++++++++++++++++++++++++++++++++++++++++++++ src/unittests.cpp | 46 +++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/include/bits.h b/include/bits.h index c7ee26d..1ced870 100644 --- a/include/bits.h +++ b/include/bits.h @@ -107,6 +107,13 @@ uint64_t select_64(uint64_t x, uint64_t rank) { return _tzcnt_u64(_pdep_u64(1ull << rank, x)); } +/** + * @brief Return position of @p rank0 0 bit in @p x + */ +uint64_t select0_64(uint64_t x, uint64_t rank0) { + return select_64(~x, rank0); +} + /** * @brief Return position of @p rank 1 bit in @p x * @details Selecting within 64-bit word can be done @@ -190,6 +197,57 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { #endif } +/** + * @brief Compare 4 64-bit numbers of ( @p pos + @p dlt - @p x ) + * with @p y and return the length of the prefix + * where @p y is less then ( @p pos + @p dlt - @p x ) + */ +uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m256i dlt_256 = _mm256_loadu_epi64(dlt); + auto x_256 = _mm256_loadu_epi64(x); + auto pos_4 = _mm256_set1_epi64x(pos); + auto y_4 = _mm256_set1_epi64x(y); + + auto tmp = _mm256_add_epi64(pos_4, dlt_256); + auto reg_256 = _mm256_sub_epi64(tmp, x_256); + auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4); + + return _tzcnt_u16(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + const __m256i dlt_256 = _mm256_loadu_si256(reinterpret_cast(dlt)); + auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); + auto pos_4 = _mm256_set1_epi64x(pos); + auto y_4 = _mm256_set1_epi64x(y); + + auto tmp = _mm256_add_epi64(pos_4, dlt_256); + auto reg_256 = _mm256_sub_epi64(tmp, x_256); + + const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL); + __m256i x_offset = _mm256_xor_si256(reg_256, offset); + __m256i y_offset = _mm256_xor_si256(y_4, offset); + auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64( + x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1)))); + + return _tzcnt_u32(mask) >> 3; + +#else + + for (uint16_t i = 0; i < 4; ++i) { + if (pos + dlt[i] - x[i] >= y) { + return i; + } + } + return 4; + +#endif +#endif +} + /** * @brief Compare 8 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x @@ -227,6 +285,49 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { #endif } +/** + * @brief Compare 8 64-bit numbers of ( @p pos + @p dlt - @p x ) + * with @p y and return the length of the prefix + * where @p y is less then ( @p pos + @p dlt - @p x ) + */ +uint16_t lower_bound_zeros_8x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m512i dlt_512 = _mm512_loadu_epi64(dlt); + auto x_512 = _mm512_loadu_epi64(x); + auto pos_8 = _mm512_set1_epi64(pos); + auto y_8 = _mm512_set1_epi64(y); + + auto tmp = _mm512_add_epi64(pos_8, dlt_512); + auto reg_512 = _mm512_sub_epi64(tmp, x_512); + auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8); + + return _tzcnt_u16(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + uint16_t len = lower_bound_zeros_4x64(x, y, dlt, pos); + + if (len < 4) { + return len; + } + + return len + lower_bound_zeros_4x64(x + 4, y, dlt + 4, pos); + +#else + + for (uint16_t i = 0; i < 8; ++i) { + if (pos + dlt[i] - x[i] >= y) { + return i; + } + } + return 8; + +#endif +#endif +} + /** * @brief Compare 32 16-bit numbers of @p x with @p y and * return the count of numbers where @p x is less then @p y diff --git a/src/unittests.cpp b/src/unittests.cpp index dddb0b7..9e202e2 100644 --- a/src/unittests.cpp +++ b/src/unittests.cpp @@ -330,6 +330,52 @@ TEST(LowerBound8x64, Random) { } } +TEST(LowerBoundZeros4x64, Random) { + std::vector x(8); + uint64_t dlt[8]; + std::mt19937_64 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint64_t y = rng(); + uint64_t pos = rng(); + uint16_t cnt = 0; + bool fl = 1; + for (size_t j = 0; j < 4; j++) { + dlt[j] = rng(); + x[j] = rng(); + fl &= pos + dlt[j] - x[j] < y; + cnt += fl; + } + if (cnt < 4) { + ASSERT_EQ(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt); + } else { + ASSERT_GE(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt); + } + } +} + +TEST(LowerBoundZeros8x64, Random) { + std::vector x(8); + uint64_t dlt[8]; + std::mt19937_64 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint64_t y = rng(); + uint64_t pos = rng(); + uint16_t cnt = 0; + bool fl = 1; + for (size_t j = 0; j < 8; j++) { + dlt[j] = rng(); + x[j] = rng(); + fl &= pos + dlt[j] - x[j] < y; + cnt += fl; + } + if (cnt < 8) { + ASSERT_EQ(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt); + } else { + ASSERT_GE(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt); + } + } +} + TEST(LowerBound32x16, Random) { std::vector x(32); std::mt19937 rng(42); From 2ab5c844cc22e9532372363eb21e149bc6980f7c Mon Sep 17 00:00:00 2001 From: mperikov Date: Wed, 17 Dec 2025 14:34:43 +0300 Subject: [PATCH 2/9] Format fix --- include/bits.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/bits.h b/include/bits.h index 1ced870..9af8eea 100644 --- a/include/bits.h +++ b/include/bits.h @@ -202,7 +202,10 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { * with @p y and return the length of the prefix * where @p y is less then ( @p pos + @p dlt - @p x ) */ -uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) { +uint16_t lower_bound_zeros_4x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt, + uint64_t pos) { #ifdef PIXIE_AVX512_SUPPORT const __m256i dlt_256 = _mm256_loadu_epi64(dlt); @@ -219,7 +222,8 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* d #else #ifdef PIXIE_AVX2_SUPPORT - const __m256i dlt_256 = _mm256_loadu_si256(reinterpret_cast(dlt)); + const __m256i dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt)); auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); auto pos_4 = _mm256_set1_epi64x(pos); auto y_4 = _mm256_set1_epi64x(y); @@ -290,7 +294,10 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { * with @p y and return the length of the prefix * where @p y is less then ( @p pos + @p dlt - @p x ) */ -uint16_t lower_bound_zeros_8x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) { +uint16_t lower_bound_zeros_8x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt, + uint64_t pos) { #ifdef PIXIE_AVX512_SUPPORT const __m512i dlt_512 = _mm512_loadu_epi64(dlt); From c947e32afc45ec7b35188e92ae601037ece3f880 Mon Sep 17 00:00:00 2001 From: mperikov Date: Thu, 18 Dec 2025 14:00:51 +0300 Subject: [PATCH 3/9] lower_bound_dlt_32x16 + select0_512 + tests --- include/bits.h | 160 ++++++++++++++++++++++++++++++++--------- include/bitvector.h | 169 +++++++++++++++++++++++++++++++++++++++++--- src/unittests.cpp | 85 ++++++++++++++++++---- 3 files changed, 354 insertions(+), 60 deletions(-) diff --git a/include/bits.h b/include/bits.h index 9af8eea..dd87f7f 100644 --- a/include/bits.h +++ b/include/bits.h @@ -107,13 +107,6 @@ uint64_t select_64(uint64_t x, uint64_t rank) { return _tzcnt_u64(_pdep_u64(1ull << rank, x)); } -/** - * @brief Return position of @p rank0 0 bit in @p x - */ -uint64_t select0_64(uint64_t x, uint64_t rank0) { - return select_64(~x, rank0); -} - /** * @brief Return position of @p rank 1 bit in @p x * @details Selecting within 64-bit word can be done @@ -135,8 +128,8 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); - std::array counts; - _mm512_storeu_epi64(counts.data(), _mm512_popcnt_epi64(res)); + alignas(64) std::array counts; + _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); size_t i = 0; while (i < 8 && counts[i] <= rank) { @@ -157,6 +150,37 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { #endif } +/** + * @brief Return position of @p rank0 0 bit in @p x + * @details select_512 with bit inversion. + */ +uint64_t select0_512(const uint64_t* x, uint64_t rank0) { +#ifdef PIXIE_AVX512_SUPPORT + + __m512i res = _mm512_loadu_epi64(x); + res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1)); + alignas(64) std::array counts; + _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); + + size_t i = 0; + while (i < 8 && counts[i] <= rank0) { + rank0 -= counts[i++]; + } + return i * 64 + select_64(~x[i], rank0); + +#else + + size_t i = 0; + int popcount = std::popcount(~x[0]); + while (i < 7 && popcount <= rank0) { + rank0 -= popcount; + popcount = std::popcount(~x[++i]); + } + return i * 64 + select_64(~x[i], rank0); + +#endif +} + /** * @brief Compare 4 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x @@ -198,22 +222,22 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { } /** - * @brief Compare 4 64-bit numbers of ( @p pos + @p dlt - @p x ) + * @brief Compare 4 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) * with @p y and return the length of the prefix - * where @p y is less then ( @p pos + @p dlt - @p x ) + * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x ) */ -uint16_t lower_bound_zeros_4x64(const uint64_t* x, - uint64_t y, - const uint64_t* dlt, - uint64_t pos) { +uint16_t lower_bound_dlt_4x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt_array, + uint64_t dlt_scalar) { #ifdef PIXIE_AVX512_SUPPORT - const __m256i dlt_256 = _mm256_loadu_epi64(dlt); + const __m256i dlt_256 = _mm256_loadu_epi64(dlt_array); auto x_256 = _mm256_loadu_epi64(x); - auto pos_4 = _mm256_set1_epi64x(pos); + auto dlt_4 = _mm256_set1_epi64x(dlt_scalar); auto y_4 = _mm256_set1_epi64x(y); - auto tmp = _mm256_add_epi64(pos_4, dlt_256); + auto tmp = _mm256_add_epi64(dlt_4, dlt_256); auto reg_256 = _mm256_sub_epi64(tmp, x_256); auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4); @@ -223,12 +247,12 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x, #ifdef PIXIE_AVX2_SUPPORT const __m256i dlt_256 = - _mm256_loadu_si256(reinterpret_cast(dlt)); + _mm256_loadu_si256(reinterpret_cast(dlt_array)); auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); - auto pos_4 = _mm256_set1_epi64x(pos); + auto dlt_4 = _mm256_set1_epi64x(dlt_scalar); auto y_4 = _mm256_set1_epi64x(y); - auto tmp = _mm256_add_epi64(pos_4, dlt_256); + auto tmp = _mm256_add_epi64(dlt_4, dlt_256); auto reg_256 = _mm256_sub_epi64(tmp, x_256); const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL); @@ -242,7 +266,7 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x, #else for (uint16_t i = 0; i < 4; ++i) { - if (pos + dlt[i] - x[i] >= y) { + if (dlt_array[i] + dlt_scalar - x[i] >= y) { return i; } } @@ -290,22 +314,22 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { } /** - * @brief Compare 8 64-bit numbers of ( @p pos + @p dlt - @p x ) + * @brief Compare 8 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) * with @p y and return the length of the prefix - * where @p y is less then ( @p pos + @p dlt - @p x ) + * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x ) */ -uint16_t lower_bound_zeros_8x64(const uint64_t* x, - uint64_t y, - const uint64_t* dlt, - uint64_t pos) { +uint16_t lower_bound_dlt_8x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt_array, + uint64_t dlt_scalar) { #ifdef PIXIE_AVX512_SUPPORT - const __m512i dlt_512 = _mm512_loadu_epi64(dlt); + const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array); auto x_512 = _mm512_loadu_epi64(x); - auto pos_8 = _mm512_set1_epi64(pos); + auto dlt_8 = _mm512_set1_epi64(dlt_scalar); auto y_8 = _mm512_set1_epi64(y); - auto tmp = _mm512_add_epi64(pos_8, dlt_512); + auto tmp = _mm512_add_epi64(dlt_8, dlt_512); auto reg_512 = _mm512_sub_epi64(tmp, x_512); auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8); @@ -314,18 +338,18 @@ uint16_t lower_bound_zeros_8x64(const uint64_t* x, #else #ifdef PIXIE_AVX2_SUPPORT - uint16_t len = lower_bound_zeros_4x64(x, y, dlt, pos); + uint16_t len = lower_bound_dlt_4x64(x, y, dlt_array, dlt_scalar); if (len < 4) { return len; } - return len + lower_bound_zeros_4x64(x + 4, y, dlt + 4, pos); + return len + lower_bound_dlt_4x64(x + 4, y, dlt_array + 4, dlt_scalar); #else for (uint16_t i = 0; i < 8; ++i) { - if (pos + dlt[i] - x[i] >= y) { + if (dlt_array[i] + dlt_scalar - x[i] >= y) { return i; } } @@ -339,7 +363,7 @@ uint16_t lower_bound_zeros_8x64(const uint64_t* x, * @brief Compare 32 16-bit numbers of @p x with @p y and * return the count of numbers where @p x is less then @p y */ -uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) { +uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_32 = _mm512_set1_epi16(y); @@ -381,6 +405,72 @@ uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) { #endif } +/** + * @brief Compare 32 16-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) + * with @p y and return the count of numbers where + * ( @p dlt_array + @p dlt_scalar - @p x ) is less then @p y + */ +uint16_t lower_bound_dlt_32x16(const uint16_t* x, + uint16_t y, + const uint16_t* dlt_array, + uint16_t dlt_scalar) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array); + auto x_512 = _mm512_loadu_epi64(x); + auto dlt_32 = _mm512_set1_epi16(dlt_scalar); + auto y_32 = _mm512_set1_epi16(y); + + auto tmp = _mm512_add_epi16(dlt_32, dlt_512); + auto reg_512 = _mm512_sub_epi16(tmp, x_512); + auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32); + return std::popcount(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + auto dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt_array)); + auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); + auto dlt_16 = _mm256_set1_epi16(dlt_scalar); + auto y_16 = _mm256_set1_epi16(y); + + auto tmp = _mm256_add_epi16(dlt_16, dlt_256); + auto reg_256 = _mm256_sub_epi16(tmp, x_256); + + const __m256i offset = _mm256_set1_epi16(0x8000); + __m256i x_offset = _mm256_xor_si256(reg_256, offset); + __m256i y_offset = _mm256_xor_si256(y_16, offset); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset)); + + uint16_t count = std::popcount(mask) >> 1; + + dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt_array + 16)); + x_256 = _mm256_loadu_si256(reinterpret_cast(x + 16)); + + tmp = _mm256_add_epi16(dlt_16, dlt_256); + reg_256 = _mm256_sub_epi16(tmp, x_256); + + x_offset = _mm256_xor_si256(reg_256, offset); + mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset)); + + return count + (std::popcount(mask) >> 1); + +#else + + uint16_t cnt = 0; + for (uint16_t i = 0; i < 32; ++i) { + if (dlt_array[i] + dlt_scalar - x[i] < y) { + cnt++; + } + } + return cnt; + +#endif +#endif +} + /** * @brief Calculates 64 popcounts of 4-bits integers and stores as 64 4-bits * integers (packed into 32 bytes) diff --git a/include/bitvector.h b/include/bitvector.h index 117e00a..de2a530 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -72,9 +72,24 @@ class BitVector { constexpr static size_t kSelectSampleFrequency = 16384; constexpr static size_t kBlocksPerSuperBlock = 128; + struct PreCalculation { + alignas(64) static uint64_t dlt_super[8]; + alignas(64) static uint16_t dlt_basic[32]; + + constexpr PreCalculation() { + for (size_t i = 0; i < 8; ++i) { + dlt_super[i] = i * kSuperBlockSize; + } + for (size_t i = 0; i < 32; ++i) { + dlt_basic[i] = i * kBasicBlockSize; + } + } + }; + std::vector super_block_rank; std::vector basic_block_rank; std::vector select_samples; + std::vector select0_samples; const size_t num_bits_; const size_t padded_size_; size_t max_rank; @@ -119,10 +134,14 @@ class BitVector { */ void build_select() { uint64_t milestone = kSelectSampleFrequency; + uint64_t milestone0 = kSelectSampleFrequency; uint64_t rank = 0; + uint64_t rank0 = 0; select_samples.emplace_back(0); + select0_samples.emplace_back(0); for (size_t i = 0; i < bits.size(); ++i) { auto ones = std::popcount(bits[i]); + auto zeros = 64 - ones; if (rank + ones >= milestone) { auto pos = select_64(bits[i], milestone - rank - 1); // TODO: try including global rank into select samples to save @@ -130,7 +149,13 @@ class BitVector { select_samples.emplace_back((64 * i + pos) / kSuperBlockSize); milestone += kSelectSampleFrequency; } + if (rank0 + zeros >= milestone0) { + auto pos = select_64(~bits[i], milestone0 - rank0 - 1); + select0_samples.emplace_back((64 * i + pos) / kSuperBlockSize); + milestone0 += kSelectSampleFrequency; + } rank += ones; + rank0 += zeros; } } @@ -160,6 +185,36 @@ class BitVector { return left - 1; } + /** + * @brief first step of the select0 operation + */ + uint64_t find_superblock_zeros(uint64_t rank0) const { + uint64_t left = select0_samples[rank0 / kSelectSampleFrequency]; + + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, + PreCalculation::dlt_super, + kSuperBlockSize * left); + if (len < 8) { + return left + len - 1; + } + left += 8; + } + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, + PreCalculation::dlt_super, + kSuperBlockSize * left); + if (len < 4) { + return left + len - 1; + } + left += 4; + } + while (left < super_block_rank.size() && super_block_rank[left] < rank0) { + left++; + } + return left - 1; + } + /** * @brief SIMD-optimized linear scan * @details @@ -167,17 +222,32 @@ class BitVector { * 4 iterations. */ uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const { - auto pos = 0; + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); + if (count < 32) { + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + } + return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1; + } - for (size_t i = 0; i < 4; ++i) { - auto count = lower_bound_32x16(&basic_block_rank[128 * s_block + 32 * i], - local_rank); + /** + * @brief SIMD-optimized linear scan + * @details + * Processes 32 16-bit entries at once (full cache line), so there is at most + * 4 iterations. + */ + uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const { + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + PreCalculation::dlt_basic, kBasicBlockSize * pos); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; } - pos += 32; } - return kBlocksPerSuperBlock * s_block + pos - 1; + return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1; } /** @@ -197,8 +267,8 @@ class BitVector { pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 96 : pos; while (pos < 96) { - auto count = - lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank); + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -208,14 +278,54 @@ class BitVector { pos += 32; } pos = 96; - auto count = - lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank); + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } return kBlocksPerSuperBlock * s_block + pos + count - 1; } + /** + * @brief Interpolation search with SIMD optimization + * @details + * Similar to find_basicblock_zeros but initial guess is based on linear + * interpolation, for random data it should make initial guess correct + * most of the times, we start from the 32 wide block with interpolation + * guess at the center, if we see that select result lie in lower blocks + * we backoff to find_basicblock_zeros + */ + uint64_t find_basicblock_is_zeros(uint16_t local_rank0, + uint64_t s_block) const { + auto lower = kSuperBlockSize * s_block - super_block_rank[s_block]; + auto upper = + kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1]; + + uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower); + pos = pos + 16 < 32 ? 0 : (pos - 16); + pos = pos > 96 ? 96 : pos; + while (pos < 96) { + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + PreCalculation::dlt_basic, kBasicBlockSize * pos); + if (count == 0) { + return find_basicblock_zeros(local_rank0, s_block); + } + if (count < 32) { + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + pos += 32; + } + pos = 96; + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + PreCalculation::dlt_basic, kBasicBlockSize * pos); + if (count == 0) { + return find_basicblock_zeros(local_rank0, s_block); + } + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + public: /** * Constructor from an external array of uint64_t. @@ -258,6 +368,16 @@ class BitVector { return result; } + /** + * Rank zero operation: count the number of 0-bits up to position pos + * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1] + */ + uint64_t rank0(size_t pos) const { return pos - rank(pos); } + + /** + * Select operation: find the position of the i-th occurrence of a 1-bit + * select_1(rank) = index of the rank-th occurrence of 1-bit + */ uint64_t select(size_t rank) const { if (rank > max_rank) [[unlikely]] { return num_bits_; @@ -283,6 +403,35 @@ class BitVector { return kWordSize * pos + select_512(&bits[pos], rank - 1); } + /** + * Select zero operation: find the position of the i-th occurrence of a 0-bit + * select_0(rank) = index of the rank-th occurrence of 0-bit + */ + uint64_t select0(size_t rank) const { + if (rank > max_rank) [[unlikely]] { + return num_bits_; + } + if (rank == 0) [[unlikely]] { + return 0; + } + uint64_t s_block = find_superblock(rank); + rank -= super_block_rank[s_block]; + auto pos = find_basicblock_is(rank, s_block); + rank -= basic_block_rank[pos]; + pos *= kWordsPerBlock; + + // Final search + if (pos + kWordsPerBlock - 1 < kWordsPerBlock) [[unlikely]] { + size_t ones = std::popcount(bits[pos]); + while (pos < bits.size() && ones < rank) { + rank -= ones; + ones = std::popcount(bits[++pos]); + } + return kWordSize * pos + select_64(bits[pos], rank - 1); + } + return kWordSize * pos + select_512(&bits[pos], rank - 1); + } + /** * Convert the bit vector to a binary string, for debug purpose only */ diff --git a/src/unittests.cpp b/src/unittests.cpp index 9e202e2..0203f1a 100644 --- a/src/unittests.cpp +++ b/src/unittests.cpp @@ -89,6 +89,14 @@ TEST(Select512, Ones) { } } +TEST(SelectZero512, Zeros) { + std::array a{0, 0, 0, 0, 0, 0, 0, 0}; + for (size_t i = 0; i < 512; ++i) { + auto p = select0_512(a.data(), i); + EXPECT_EQ(p, i); + } +} + TEST(Select512, Random) { std::array a{std::numeric_limits::max(), std::numeric_limits::max(), @@ -114,6 +122,31 @@ TEST(Select512, Random) { } } +TEST(SelectZero512, Random) { + std::array a{std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()}; + + std::mt19937_64 rng(42); + for (size_t t = 0; t < 1000; ++t) { + for (auto& x : a) { + x = rng(); + } + size_t rank = 0; + for (size_t i = 0; i < 512; ++i) { + if ((1 & (a[i >> 6] >> (i & 63))) == 0) { + auto p = select0_512(a.data(), rank++); + ASSERT_EQ(p, i); + } + } + } +} + TEST(Select512, RankCompativility) { std::array a{std::numeric_limits::max(), std::numeric_limits::max(), @@ -330,48 +363,48 @@ TEST(LowerBound8x64, Random) { } } -TEST(LowerBoundZeros4x64, Random) { - std::vector x(8); - uint64_t dlt[8]; +TEST(LowerBoundDlt4x64, Random) { + std::vector x(4); + uint64_t dlt_array[4]; std::mt19937_64 rng(42); for (size_t i = 0; i < 100000; i++) { uint64_t y = rng(); - uint64_t pos = rng(); + uint64_t dlt_scalar = rng(); uint16_t cnt = 0; bool fl = 1; for (size_t j = 0; j < 4; j++) { - dlt[j] = rng(); + dlt_array[j] = rng(); x[j] = rng(); - fl &= pos + dlt[j] - x[j] < y; + fl &= dlt_scalar + dlt_array[j] - x[j] < y; cnt += fl; } if (cnt < 4) { - ASSERT_EQ(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt); + ASSERT_EQ(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt); } else { - ASSERT_GE(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt); + ASSERT_GE(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt); } } } -TEST(LowerBoundZeros8x64, Random) { +TEST(LowerBoundDlt8x64, Random) { std::vector x(8); - uint64_t dlt[8]; + uint64_t dlt_array[8]; std::mt19937_64 rng(42); for (size_t i = 0; i < 100000; i++) { uint64_t y = rng(); - uint64_t pos = rng(); + uint64_t dlt_scalar = rng(); uint16_t cnt = 0; bool fl = 1; for (size_t j = 0; j < 8; j++) { - dlt[j] = rng(); + dlt_array[j] = rng(); x[j] = rng(); - fl &= pos + dlt[j] - x[j] < y; + fl &= dlt_scalar + dlt_array[j] - x[j] < y; cnt += fl; } if (cnt < 8) { - ASSERT_EQ(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt); + ASSERT_EQ(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt); } else { - ASSERT_GE(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt); + ASSERT_GE(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt); } } } @@ -390,6 +423,28 @@ TEST(LowerBound32x16, Random) { } } +TEST(LowerBoundDlt32x16, Random) { + std::vector x(32); + uint16_t dlt_array[32]; + std::mt19937 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint16_t y = rng(); + uint16_t dlt_scalar = rng(); + uint16_t cnt = 0; + for (size_t j = 0; j < 32; j++) { + x[j] = rng(); + if (dlt_scalar < x[j]) { + dlt_array[j] = + x[j] - dlt_scalar + rng() % ((1 << 16) - x[j] + dlt_scalar); + } else { + dlt_array[j] = rng() % ((1 << 16) - dlt_scalar); + } + cnt += dlt_array[j] + dlt_scalar - x[j] < y; + } + ASSERT_EQ(lower_bound_dlt_32x16(x.data(), y, dlt_array, dlt_scalar), cnt); + } +} + TEST(Popcount64x4, Random) { std::vector x(32); std::vector y(32); From a953b868e3fd34443a64f99a4b95725ab474a7af Mon Sep 17 00:00:00 2001 From: mperikov Date: Thu, 18 Dec 2025 15:40:27 +0300 Subject: [PATCH 4/9] rank0/select0 + tests --- include/bitvector.h | 66 ++++++++++++++++++++-------------------- src/unittests.cpp | 74 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 33 deletions(-) diff --git a/include/bitvector.h b/include/bitvector.h index de2a530..2ed9b1a 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -7,6 +7,8 @@ #include #include +//#include + #include "bits.h" namespace pixie { @@ -72,19 +74,8 @@ class BitVector { constexpr static size_t kSelectSampleFrequency = 16384; constexpr static size_t kBlocksPerSuperBlock = 128; - struct PreCalculation { - alignas(64) static uint64_t dlt_super[8]; - alignas(64) static uint16_t dlt_basic[32]; - - constexpr PreCalculation() { - for (size_t i = 0; i < 8; ++i) { - dlt_super[i] = i * kSuperBlockSize; - } - for (size_t i = 0; i < 32; ++i) { - dlt_basic[i] = i * kBasicBlockSize; - } - } - }; + alignas(64) uint64_t dlt_super[8]; + alignas(64) uint16_t dlt_basic[32]; std::vector super_block_rank; std::vector basic_block_rank; @@ -157,6 +148,13 @@ class BitVector { rank += ones; rank0 += zeros; } + + for (size_t i = 0; i < 8; ++i) { + dlt_super[i] = i * kSuperBlockSize; + } + for (size_t i = 0; i < 32; ++i) { + dlt_basic[i] = i * kBasicBlockSize; + } } /** @@ -193,7 +191,7 @@ class BitVector { while (left + 7 < super_block_rank.size()) { auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, - PreCalculation::dlt_super, + dlt_super, kSuperBlockSize * left); if (len < 8) { return left + len - 1; @@ -202,7 +200,7 @@ class BitVector { } if (left + 3 < super_block_rank.size()) { auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, - PreCalculation::dlt_super, + dlt_super, kSuperBlockSize * left); if (len < 4) { return left + len - 1; @@ -242,7 +240,7 @@ class BitVector { for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { auto count = lower_bound_dlt_32x16( &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, - PreCalculation::dlt_basic, kBasicBlockSize * pos); + dlt_basic, kBasicBlockSize * pos); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; } @@ -307,7 +305,7 @@ class BitVector { while (pos < 96) { auto count = lower_bound_dlt_32x16( &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, - PreCalculation::dlt_basic, kBasicBlockSize * pos); + dlt_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); } @@ -319,7 +317,7 @@ class BitVector { pos = 96; auto count = lower_bound_dlt_32x16( &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, - PreCalculation::dlt_basic, kBasicBlockSize * pos); + dlt_basic, kBasicBlockSize * pos); if (count == 0) { return find_basicblock_zeros(local_rank0, s_block); } @@ -358,6 +356,9 @@ class BitVector { * rank_1(pos) = number of 1s in positions [0...pos-1] */ uint64_t rank(size_t pos) const { + if (pos >= num_bits_) [[unlikely]] { + return max_rank; + } uint64_t b_block = pos / kBasicBlockSize; uint64_t s_block = pos / kSuperBlockSize; // Precomputed rank @@ -405,31 +406,32 @@ class BitVector { /** * Select zero operation: find the position of the i-th occurrence of a 0-bit - * select_0(rank) = index of the rank-th occurrence of 0-bit + * select_0(rank0) = index of the rank0-th occurrence of 0-bit */ - uint64_t select0(size_t rank) const { - if (rank > max_rank) [[unlikely]] { + uint64_t select0(size_t rank0) const { + if (rank0 > num_bits_ - max_rank) [[unlikely]] { return num_bits_; } - if (rank == 0) [[unlikely]] { + if (rank0 == 0) [[unlikely]] { return 0; } - uint64_t s_block = find_superblock(rank); - rank -= super_block_rank[s_block]; - auto pos = find_basicblock_is(rank, s_block); - rank -= basic_block_rank[pos]; + uint64_t s_block = find_superblock_zeros(rank0); + rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block]; + auto pos = find_basicblock_is_zeros(rank0, s_block); + auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1); + rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos]; pos *= kWordsPerBlock; // Final search if (pos + kWordsPerBlock - 1 < kWordsPerBlock) [[unlikely]] { - size_t ones = std::popcount(bits[pos]); - while (pos < bits.size() && ones < rank) { - rank -= ones; - ones = std::popcount(bits[++pos]); + size_t zeros = std::popcount(~bits[pos]); + while (pos < bits.size() && zeros < rank0) { + rank0 -= zeros; + zeros = std::popcount(~bits[++pos]); } - return kWordSize * pos + select_64(bits[pos], rank - 1); + return kWordSize * pos + select_64(~bits[pos], rank0 - 1); } - return kWordSize * pos + select_512(&bits[pos], rank - 1); + return kWordSize * pos + select0_512(&bits[pos], rank0 - 1); } /** diff --git a/src/unittests.cpp b/src/unittests.cpp index 0203f1a..20db4a6 100644 --- a/src/unittests.cpp +++ b/src/unittests.cpp @@ -242,7 +242,7 @@ TEST(BitVectorTest, RankWithZeros) { TEST(BitVectorTest, SelectBasic) { std::vector bits(8, 0); bits[0] = 0b1100010110010110; - BitVector bv(bits, 5); + BitVector bv(bits, 16); EXPECT_EQ(bv.select(1), 1); EXPECT_EQ(bv.select(2), 2); @@ -288,6 +288,78 @@ TEST(BitVectorTest, MainSelectTest) { } } +TEST(BitVectorTest, RankZeroBasic) { + std::vector bits(8, 0); + bits[0] = 0b10110; + BitVector bv(bits, 10); + + EXPECT_EQ(bv.rank0(0), 0); + EXPECT_EQ(bv.rank0(1), 1); + EXPECT_EQ(bv.rank0(2), 1); + EXPECT_EQ(bv.rank0(3), 1); + EXPECT_EQ(bv.rank0(4), 2); + EXPECT_EQ(bv.rank0(5), 2); +} + +TEST(BitVectorTest, RankZeroWithOnes) { + std::vector bits(8, 0); + BitVector bv(bits, 5); + + for (size_t i = 0; i <= 5; i++) { + EXPECT_EQ(bv.rank0(i), i); + } +} + +TEST(BitVectorTest, SelectZeroBasic) { + std::vector bits(8, 0); + bits[0] = 0b1100010110010110; + BitVector bv(bits, 16); + + EXPECT_EQ(bv.select0(1), 0); + EXPECT_EQ(bv.select0(2), 3); + EXPECT_EQ(bv.select0(3), 5); + EXPECT_EQ(bv.select0(4), 6); + EXPECT_EQ(bv.select0(5), 9); + EXPECT_EQ(bv.select0(6), 11); + EXPECT_EQ(bv.select0(7), 12); + EXPECT_EQ(bv.select0(8), 13); +} + +TEST(BitVectorTest, MainRankZeroTest) { + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } + + BitVector bv(bits, bits.size() * 64); + uint64_t zero_count = 0; + for (size_t i = 0; i < bv.size(); ++i) { + ASSERT_EQ(zero_count, bv.rank0(i)); + zero_count += (bv[i] == 0) ? 1 : 0; + } +} + +TEST(BitVectorTest, MainSelectZeroTest) { + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } + + BitVector bv(bits, bits.size() * 64); + uint64_t zero_rank = 0; + + for (size_t i = 0; i < bv.size(); ++i) { + if (bv[i] == 0) { + zero_rank++; + EXPECT_EQ(bv.select0(zero_rank), i); + EXPECT_EQ(bv.rank0(i), zero_rank - 1); + EXPECT_EQ(bv.rank0(i + 1), zero_rank); + } + } +} + TEST(BitVectorInterleavedTest, AtTest) { std::mt19937_64 rng(42); /* From 13181afe18ca282d7176e14836323dbf238e1965 Mon Sep 17 00:00:00 2001 From: mperikov Date: Thu, 18 Dec 2025 15:41:31 +0300 Subject: [PATCH 5/9] Format fix --- include/bitvector.h | 8 ++-- src/unittests.cpp | 100 ++++++++++++++++++++++---------------------- 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/include/bitvector.h b/include/bitvector.h index 2ed9b1a..638e687 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -7,7 +7,7 @@ #include #include -//#include +// #include #include "bits.h" @@ -190,8 +190,7 @@ class BitVector { uint64_t left = select0_samples[rank0 / kSelectSampleFrequency]; while (left + 7 < super_block_rank.size()) { - auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, - dlt_super, + auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, dlt_super, kSuperBlockSize * left); if (len < 8) { return left + len - 1; @@ -199,8 +198,7 @@ class BitVector { left += 8; } if (left + 3 < super_block_rank.size()) { - auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, - dlt_super, + auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, dlt_super, kSuperBlockSize * left); if (len < 4) { return left + len - 1; diff --git a/src/unittests.cpp b/src/unittests.cpp index 20db4a6..27d037c 100644 --- a/src/unittests.cpp +++ b/src/unittests.cpp @@ -289,75 +289,75 @@ TEST(BitVectorTest, MainSelectTest) { } TEST(BitVectorTest, RankZeroBasic) { - std::vector bits(8, 0); - bits[0] = 0b10110; - BitVector bv(bits, 10); + std::vector bits(8, 0); + bits[0] = 0b10110; + BitVector bv(bits, 10); - EXPECT_EQ(bv.rank0(0), 0); - EXPECT_EQ(bv.rank0(1), 1); - EXPECT_EQ(bv.rank0(2), 1); - EXPECT_EQ(bv.rank0(3), 1); - EXPECT_EQ(bv.rank0(4), 2); - EXPECT_EQ(bv.rank0(5), 2); + EXPECT_EQ(bv.rank0(0), 0); + EXPECT_EQ(bv.rank0(1), 1); + EXPECT_EQ(bv.rank0(2), 1); + EXPECT_EQ(bv.rank0(3), 1); + EXPECT_EQ(bv.rank0(4), 2); + EXPECT_EQ(bv.rank0(5), 2); } TEST(BitVectorTest, RankZeroWithOnes) { - std::vector bits(8, 0); - BitVector bv(bits, 5); + std::vector bits(8, 0); + BitVector bv(bits, 5); - for (size_t i = 0; i <= 5; i++) { - EXPECT_EQ(bv.rank0(i), i); - } + for (size_t i = 0; i <= 5; i++) { + EXPECT_EQ(bv.rank0(i), i); + } } TEST(BitVectorTest, SelectZeroBasic) { - std::vector bits(8, 0); - bits[0] = 0b1100010110010110; - BitVector bv(bits, 16); + std::vector bits(8, 0); + bits[0] = 0b1100010110010110; + BitVector bv(bits, 16); - EXPECT_EQ(bv.select0(1), 0); - EXPECT_EQ(bv.select0(2), 3); - EXPECT_EQ(bv.select0(3), 5); - EXPECT_EQ(bv.select0(4), 6); - EXPECT_EQ(bv.select0(5), 9); - EXPECT_EQ(bv.select0(6), 11); - EXPECT_EQ(bv.select0(7), 12); - EXPECT_EQ(bv.select0(8), 13); + EXPECT_EQ(bv.select0(1), 0); + EXPECT_EQ(bv.select0(2), 3); + EXPECT_EQ(bv.select0(3), 5); + EXPECT_EQ(bv.select0(4), 6); + EXPECT_EQ(bv.select0(5), 9); + EXPECT_EQ(bv.select0(6), 11); + EXPECT_EQ(bv.select0(7), 12); + EXPECT_EQ(bv.select0(8), 13); } TEST(BitVectorTest, MainRankZeroTest) { - std::mt19937_64 rng(42); - std::vector bits(65536 * 32); - for (size_t i = 0; i < bits.size(); i++) { - bits[i] = rng(); - } + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } - BitVector bv(bits, bits.size() * 64); - uint64_t zero_count = 0; - for (size_t i = 0; i < bv.size(); ++i) { - ASSERT_EQ(zero_count, bv.rank0(i)); - zero_count += (bv[i] == 0) ? 1 : 0; - } + BitVector bv(bits, bits.size() * 64); + uint64_t zero_count = 0; + for (size_t i = 0; i < bv.size(); ++i) { + ASSERT_EQ(zero_count, bv.rank0(i)); + zero_count += (bv[i] == 0) ? 1 : 0; + } } TEST(BitVectorTest, MainSelectZeroTest) { - std::mt19937_64 rng(42); - std::vector bits(65536 * 32); - for (size_t i = 0; i < bits.size(); i++) { - bits[i] = rng(); - } + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } - BitVector bv(bits, bits.size() * 64); - uint64_t zero_rank = 0; + BitVector bv(bits, bits.size() * 64); + uint64_t zero_rank = 0; - for (size_t i = 0; i < bv.size(); ++i) { - if (bv[i] == 0) { - zero_rank++; - EXPECT_EQ(bv.select0(zero_rank), i); - EXPECT_EQ(bv.rank0(i), zero_rank - 1); - EXPECT_EQ(bv.rank0(i + 1), zero_rank); - } + for (size_t i = 0; i < bv.size(); ++i) { + if (bv[i] == 0) { + zero_rank++; + EXPECT_EQ(bv.select0(zero_rank), i); + EXPECT_EQ(bv.rank0(i), zero_rank - 1); + EXPECT_EQ(bv.rank0(i + 1), zero_rank); } + } } TEST(BitVectorInterleavedTest, AtTest) { From 92e0609349409c2b1894155e48889eb07eb9a70d Mon Sep 17 00:00:00 2001 From: mperikov Date: Fri, 19 Dec 2025 15:22:40 +0300 Subject: [PATCH 6/9] rank/rank0 and find_superblock_zeros fix --- include/bitvector.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/bitvector.h b/include/bitvector.h index 638e687..6f2e959 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -205,7 +205,7 @@ class BitVector { } left += 4; } - while (left < super_block_rank.size() && super_block_rank[left] < rank0) { + while (left < super_block_rank.size() && kSuperBlockSize * left - super_block_rank[left] < rank0) { left++; } return left - 1; @@ -354,7 +354,7 @@ class BitVector { * rank_1(pos) = number of 1s in positions [0...pos-1] */ uint64_t rank(size_t pos) const { - if (pos >= num_bits_) [[unlikely]] { + if (pos >= bits.size() * kWordSize) [[unlikely]] { return max_rank; } uint64_t b_block = pos / kBasicBlockSize; @@ -371,7 +371,12 @@ class BitVector { * Rank zero operation: count the number of 0-bits up to position pos * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1] */ - uint64_t rank0(size_t pos) const { return pos - rank(pos); } + uint64_t rank0(size_t pos) const { + if (pos >= bits.size() * kWordSize) [[unlikely]] { + return bits.size() * kWordSize - max_rank; + } + return pos - rank(pos); + } /** * Select operation: find the position of the i-th occurrence of a 1-bit From 01819e1c6ea403d3cc9114627a3abaa4001b503c Mon Sep 17 00:00:00 2001 From: mperikov Date: Fri, 19 Dec 2025 15:23:06 +0300 Subject: [PATCH 7/9] select0/rank0 benchmarks --- src/benchmark_tests.cpp | 67 +++++++++++++++++- src/benchmarks.cpp | 152 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 215 insertions(+), 4 deletions(-) diff --git a/src/benchmark_tests.cpp b/src/benchmark_tests.cpp index 1233a3c..2c302f4 100644 --- a/src/benchmark_tests.cpp +++ b/src/benchmark_tests.cpp @@ -10,7 +10,7 @@ using pixie::BitVector; using pixie::BitVectorInterleaved; TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -30,8 +30,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) { } } +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved10PercentFill) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} + TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -51,8 +72,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) { } } +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved90PercentFill) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} + TEST(BitVectorBenchmarkTest, SelectNonInterleaved) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -69,3 +111,22 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved) { } } } + +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp index e2586fa..92bafe2 100644 --- a/src/benchmarks.cpp +++ b/src/benchmarks.cpp @@ -24,6 +24,22 @@ static void BM_RankNonInterleaved(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_RankInterleaved(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -58,6 +74,24 @@ static void BM_SelectNonInterleaved(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select0(rank0)); + } +} + static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -77,6 +111,25 @@ static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -98,6 +151,27 @@ static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select0(rank0)); + } +} + static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -117,6 +191,25 @@ static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -138,13 +231,40 @@ static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select(rank0)); + } +} + +BENCHMARK(BM_RankInterleaved) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); -BENCHMARK(BM_RankInterleaved) +BENCHMARK(BM_RankZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) @@ -156,26 +276,56 @@ BENCHMARK(BM_SelectNonInterleaved) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_SelectZeroNonInterleaved) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved10PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_RankZeroNonInterleaved10PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_SelectNonInterleaved10PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved90PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_RankZeroNonInterleaved90PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_SelectNonInterleaved90PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); + +BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); \ No newline at end of file From 5182b450cb293ee7d00ef2e72cda73368df27697 Mon Sep 17 00:00:00 2001 From: mperikov Date: Fri, 19 Dec 2025 15:25:15 +0300 Subject: [PATCH 8/9] Format fix --- include/bitvector.h | 3 ++- src/benchmarks.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/bitvector.h b/include/bitvector.h index 6f2e959..60a8f7e 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -205,7 +205,8 @@ class BitVector { } left += 4; } - while (left < super_block_rank.size() && kSuperBlockSize * left - super_block_rank[left] < rank0) { + while (left < super_block_rank.size() && + kSuperBlockSize * left - super_block_rank[left] < rank0) { left++; } return left - 1; diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp index 92bafe2..54a0ea6 100644 --- a/src/benchmarks.cpp +++ b/src/benchmarks.cpp @@ -328,4 +328,4 @@ BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) - ->Iterations(100000); \ No newline at end of file + ->Iterations(100000); From 8ae31fbf91dd7db41f3ed7c81501475d1b22054c Mon Sep 17 00:00:00 2001 From: mperikov Date: Fri, 19 Dec 2025 22:03:45 +0300 Subject: [PATCH 9/9] include fix --- include/bitvector.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/bitvector.h b/include/bitvector.h index 60a8f7e..43e5bc9 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -7,8 +7,6 @@ #include #include -// #include - #include "bits.h" namespace pixie {