diff --git a/include/bits.h b/include/bits.h
index c7ee26d..dd87f7f 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -128,8 +128,8 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   __m512i res = _mm512_loadu_epi64(x);
-  std::array<uint64_t, 8> counts;
-  _mm512_storeu_epi64(counts.data(), _mm512_popcnt_epi64(res));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
 
   size_t i = 0;
   while (i < 8 && counts[i] <= rank) {
@@ -150,6 +150,37 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #endif
 }
 
+/**
+ * @brief Return the position of the @p rank0 -th 0 bit in @p x
+ * @details select_512 with bit inversion.
+ */
+uint64_t select0_512(const uint64_t* x, uint64_t rank0) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  __m512i res = _mm512_loadu_epi64(x);
+  res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
+
+  size_t i = 0;
+  while (i < 8 && counts[i] <= rank0) {
+    rank0 -= counts[i++];
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#else
+
+  size_t i = 0;
+  int popcount = std::popcount(~x[0]);
+  while (i < 7 && popcount <= rank0) {
+    rank0 -= popcount;
+    popcount = std::popcount(~x[++i]);
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#endif
+}
+
 /**
  * @brief Compare 4 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -190,6 +221,61 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 4 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the length of the prefix
+ * where ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_4x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m256i dlt_256 = _mm256_loadu_epi64(dlt_array);
+  auto x_256 = _mm256_loadu_epi64(x);
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+  auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  const __m256i dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_4, offset);
+  auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
+      x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
+
+  return _tzcnt_u32(mask) >> 3;
+
+#else
+
+  for (uint16_t i = 0; i < 4; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
+      return i;
+    }
+  }
+  return 4;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 8 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -227,11 +313,57 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 8 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the length of the prefix
+ * where ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_8x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto dlt_8 = _mm512_set1_epi64(dlt_scalar);
+  auto y_8 = _mm512_set1_epi64(y);
+
+  auto tmp = _mm512_add_epi64(dlt_8, dlt_512);
+  auto reg_512 = _mm512_sub_epi64(tmp, x_512);
+  auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  uint16_t len = lower_bound_dlt_4x64(x, y, dlt_array, dlt_scalar);
+
+  if (len < 4) {
+    return len;
+  }
+
+  return len + lower_bound_dlt_4x64(x + 4, y, dlt_array + 4, dlt_scalar);
+
+#else
+
+  for (uint16_t i = 0; i < 8; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
+      return i;
+    }
+  }
+  return 8;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 32 16-bit numbers of @p x with @p y and
  * return the count of numbers where @p x is less then @p y
  */
-uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
+uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   auto y_32 = _mm512_set1_epi16(y);
@@ -273,6 +405,72 @@ uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 32 16-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the count of numbers where
+ * ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_32x16(const uint16_t* x,
+                               uint16_t y,
+                               const uint16_t* dlt_array,
+                               uint16_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto dlt_32 = _mm512_set1_epi16(dlt_scalar);
+  auto y_32 = _mm512_set1_epi16(y);
+
+  auto tmp = _mm512_add_epi16(dlt_32, dlt_512);
+  auto reg_512 = _mm512_sub_epi16(tmp, x_512);
+  auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
+  return std::popcount(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  auto dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto dlt_16 = _mm256_set1_epi16(dlt_scalar);
+  auto y_16 = _mm256_set1_epi16(y);
+
+  auto tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  auto reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi16(0x8000);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_16, offset);
+  uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  uint16_t count = std::popcount(mask) >> 1;
+
+  dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array + 16));
+  x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + 16));
+
+  tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  x_offset = _mm256_xor_si256(reg_256, offset);
+  mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  return count + (std::popcount(mask) >> 1);
+
+#else
+
+  uint16_t cnt = 0;
+  for (uint16_t i = 0; i < 32; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] < y) {
+      cnt++;
+    }
+  }
+  return cnt;
+
+#endif
+#endif
+}
+
 /**
  * @brief Calculates 64 popcounts of 4-bits integers and stores as 64 4-bits
  * integers (packed into 32 bytes)
diff --git a/include/bitvector.h b/include/bitvector.h
index 117e00a..43e5bc9 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -72,9 +72,13 @@ class BitVector {
   constexpr static size_t kSelectSampleFrequency = 16384;
   constexpr static size_t kBlocksPerSuperBlock = 128;
 
+  alignas(64) uint64_t dlt_super[8];
+  alignas(64) uint16_t dlt_basic[32];
+
   std::vector<uint64_t> super_block_rank;
   std::vector<uint16_t> basic_block_rank;
   std::vector<uint64_t> select_samples;
+  std::vector<uint64_t> select0_samples;
   const size_t num_bits_;
   const size_t padded_size_;
   size_t max_rank;
@@ -119,10 +123,14 @@ class BitVector {
    */
   void build_select() {
     uint64_t milestone = kSelectSampleFrequency;
+    uint64_t milestone0 = kSelectSampleFrequency;
     uint64_t rank = 0;
+    uint64_t rank0 = 0;
     select_samples.emplace_back(0);
+    select0_samples.emplace_back(0);
     for (size_t i = 0; i < bits.size(); ++i) {
       auto ones = std::popcount(bits[i]);
+      auto zeros = 64 - ones;
       if (rank + ones >= milestone) {
         auto pos = select_64(bits[i], milestone - rank - 1);
         // TODO: try including global rank into select samples to save
@@ -130,7 +138,20 @@ class BitVector {
         select_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
         milestone += kSelectSampleFrequency;
       }
+      if (rank0 + zeros >= milestone0) {
+        auto pos = select_64(~bits[i], milestone0 - rank0 - 1);
+        select0_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
+        milestone0 += kSelectSampleFrequency;
+      }
       rank += ones;
+      rank0 += zeros;
+    }
+
+    for (size_t i = 0; i < 8; ++i) {
+      dlt_super[i] = i * kSuperBlockSize;
+    }
+    for (size_t i = 0; i < 32; ++i) {
+      dlt_basic[i] = i * kBasicBlockSize;
     }
   }
 
@@ -160,6 +181,35 @@ class BitVector {
     return left - 1;
   }
 
+  /**
+   * @brief First step of the select0 operation
+   */
+  uint64_t find_superblock_zeros(uint64_t rank0) const {
+    uint64_t left = select0_samples[rank0 / kSelectSampleFrequency];
+
+    while (left + 7 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 8) {
+        return left + len - 1;
+      }
+      left += 8;
+    }
+    if (left + 3 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 4) {
+        return left + len - 1;
+      }
+      left += 4;
+    }
+    while (left < super_block_rank.size() &&
+           kSuperBlockSize * left - super_block_rank[left] < rank0) {
+      left++;
+    }
+    return left - 1;
+  }
+
   /**
    * @brief SIMD-optimized linear scan
   * @details
   * Processes 32 16-bit entries at once (full cache line), so there is at most
   * 4 iterations.
   */
  uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const {
-    auto pos = 0;
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+    }
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
+  }
 
-    for (size_t i = 0; i < 4; ++i) {
-      auto count = lower_bound_32x16(&basic_block_rank[128 * s_block + 32 * i],
-                                     local_rank);
+  /**
+   * @brief SIMD-optimized linear scan
+   * @details
+   * Processes 32 16-bit entries at once (full cache line), so there are at
+   * most 4 iterations.
+   */
+  uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const {
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          dlt_basic, kBasicBlockSize * pos);
       if (count < 32) {
         return kBlocksPerSuperBlock * s_block + pos + count - 1;
       }
-      pos += 32;
     }
-    return kBlocksPerSuperBlock * s_block + pos - 1;
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
   }
 
   /**
@@ -197,8 +262,8 @@ class BitVector {
     pos = pos + 16 < 32 ? 0 : (pos - 16);
     pos = pos > 96 ? 96 : pos;
     while (pos < 96) {
-      auto count =
-          lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
       if (count == 0) {
         return find_basicblock(local_rank, s_block);
       }
@@ -208,14 +273,54 @@ class BitVector {
       pos += 32;
     }
     pos = 96;
-    auto count =
-        lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+    auto count = lower_bound_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
     if (count == 0) {
       return find_basicblock(local_rank, s_block);
     }
     return kBlocksPerSuperBlock * s_block + pos + count - 1;
   }
 
+  /**
+   * @brief Interpolation search with SIMD optimization
+   * @details
+   * Similar to find_basicblock_zeros, but the initial guess is based on
+   * linear interpolation; for random data this guess should be correct most
+   * of the time. We start from the 32-wide block with the interpolation
+   * guess at its center; if the select result lies in lower blocks, we back
+   * off to find_basicblock_zeros.
+   */
+  uint64_t find_basicblock_is_zeros(uint16_t local_rank0,
+                                    uint64_t s_block) const {
+    auto lower = kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto upper =
+        kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1];
+
+    uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower);
+    pos = pos + 16 < 32 ? 0 : (pos - 16);
+    pos = pos > 96 ? 96 : pos;
+    while (pos < 96) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          dlt_basic, kBasicBlockSize * pos);
+      if (count == 0) {
+        return find_basicblock_zeros(local_rank0, s_block);
+      }
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+      pos += 32;
+    }
+    pos = 96;
+    auto count = lower_bound_dlt_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+        dlt_basic, kBasicBlockSize * pos);
+    if (count == 0) {
+      return find_basicblock_zeros(local_rank0, s_block);
+    }
+    return kBlocksPerSuperBlock * s_block + pos + count - 1;
+  }
+
  public:
   /**
    * Constructor from an external array of uint64_t.
@@ -248,6 +353,9 @@ class BitVector {
    * rank_1(pos) = number of 1s in positions [0...pos-1]
    */
   uint64_t rank(size_t pos) const {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
+      return max_rank;
+    }
     uint64_t b_block = pos / kBasicBlockSize;
     uint64_t s_block = pos / kSuperBlockSize;
     // Precomputed rank
@@ -258,6 +366,21 @@ class BitVector {
     return result;
   }
 
+  /**
+   * Rank zero operation: count the number of 0-bits up to position pos
+   * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1]
+   */
+  uint64_t rank0(size_t pos) const {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
+      return bits.size() * kWordSize - max_rank;
+    }
+    return pos - rank(pos);
+  }
+
+  /**
+   * Select operation: find the position of the rank-th occurrence of a 1-bit
+   * select_1(rank) = index of the rank-th occurrence of 1-bit
+   */
   uint64_t select(size_t rank) const {
     if (rank > max_rank) [[unlikely]] {
       return num_bits_;
@@ -283,6 +406,36 @@ class BitVector {
     return kWordSize * pos + select_512(&bits[pos], rank - 1);
   }
 
+  /**
+   * Select zero operation: find the position of the rank0-th occurrence of a
+   * 0-bit select_0(rank0) = index of the rank0-th occurrence of 0-bit
+   */
+  uint64_t select0(size_t rank0) const {
+    if (rank0 > num_bits_ - max_rank) [[unlikely]] {
+      return num_bits_;
+    }
+    if (rank0 == 0) [[unlikely]] {
+      return 0;
+    }
+    uint64_t s_block = find_superblock_zeros(rank0);
+    rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto pos = find_basicblock_is_zeros(rank0, s_block);
+    auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1);
+    rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos];
+    pos *= kWordsPerBlock;
+
+    // Final search
+    if (pos + kWordsPerBlock - 1 >= bits.size()) [[unlikely]] {
+      size_t zeros = std::popcount(~bits[pos]);
+      while (pos < bits.size() && zeros < rank0) {
+        rank0 -= zeros;
+        zeros = std::popcount(~bits[++pos]);
+      }
+      return kWordSize * pos + select_64(~bits[pos], rank0 - 1);
+    }
+    return kWordSize * pos + select0_512(&bits[pos], rank0 - 1);
+  }
+
   /**
    * Convert the bit vector to a binary string, for debug purpose only
    */
diff --git a/src/benchmark_tests.cpp b/src/benchmark_tests.cpp
index 1233a3c..2c302f4 100644
--- a/src/benchmark_tests.cpp
+++ b/src/benchmark_tests.cpp
@@ -10,7 +10,7 @@ using pixie::BitVector;
 using pixie::BitVectorInterleaved;
 
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -30,8 +30,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved10PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.1;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -51,8 +72,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved90PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.9;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -69,3 +111,22 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
     }
   }
 }
+
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    for (auto& x : bits) {
+      x = rng();
+    }
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp
index e2586fa..54a0ea6 100644
--- a/src/benchmarks.cpp
+++ b/src/benchmarks.cpp
@@ -24,6 +24,22 @@ static void BM_RankNonInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_RankInterleaved(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -58,6 +74,24 @@ static void BM_RankInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -77,6 +111,25 @@ static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -98,6 +151,27 @@ static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -117,6 +191,25 @@ static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -138,13 +231,40 @@ static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
+BENCHMARK(BM_RankInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
-BENCHMARK(BM_RankInterleaved)
+BENCHMARK(BM_RankZeroNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
@@ -156,26 +276,56 @@ BENCHMARK(BM_SelectNonInterleaved)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
    ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
+
+BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
diff --git a/src/unittests.cpp b/src/unittests.cpp
index dddb0b7..27d037c 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -89,6 +89,14 @@ TEST(Select512, Ones) {
   }
 }
 
+TEST(SelectZero512, Zeros) {
+  std::array<uint64_t, 8> a{0, 0, 0, 0, 0, 0, 0, 0};
+  for (size_t i = 0; i < 512; ++i) {
+    auto p = select0_512(a.data(), i);
+    EXPECT_EQ(p, i);
+  }
+}
+
 TEST(Select512, Random) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -114,6 +122,31 @@ TEST(Select512, Random) {
   }
 }
 
+TEST(SelectZero512, Random) {
+  std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max()};
+
+  std::mt19937_64 rng(42);
+  for (size_t t = 0; t < 1000; ++t) {
+    for (auto& x : a) {
+      x = rng();
+    }
+    size_t rank = 0;
+    for (size_t i = 0; i < 512; ++i) {
+      if ((1 & (a[i >> 6] >> (i & 63))) == 0) {
+        auto p = select0_512(a.data(), rank++);
+        ASSERT_EQ(p, i);
+      }
+    }
+  }
+}
+
 TEST(Select512, RankCompativility) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -209,7 +242,7 @@ TEST(BitVectorTest, RankWithZeros) {
 
 TEST(BitVectorTest, SelectBasic) {
   std::vector<uint64_t> bits(8, 0);
   bits[0] = 0b1100010110010110;
-  BitVector bv(bits, 5);
+  BitVector bv(bits, 16);
 
   EXPECT_EQ(bv.select(1), 1);
   EXPECT_EQ(bv.select(2), 2);
@@ -255,6 +288,78 @@ TEST(BitVectorTest, MainSelectTest) {
   }
 }
 
+TEST(BitVectorTest, RankZeroBasic) {
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b10110;
+  BitVector bv(bits, 10);
+
+  EXPECT_EQ(bv.rank0(0), 0);
+  EXPECT_EQ(bv.rank0(1), 1);
+  EXPECT_EQ(bv.rank0(2), 1);
+  EXPECT_EQ(bv.rank0(3), 1);
+  EXPECT_EQ(bv.rank0(4), 2);
+  EXPECT_EQ(bv.rank0(5), 2);
+}
+
+TEST(BitVectorTest, RankZeroWithOnes) {
+  std::vector<uint64_t> bits(8, 0);
+  BitVector bv(bits, 5);
+
+  for (size_t i = 0; i <= 5; i++) {
+    EXPECT_EQ(bv.rank0(i), i);
+  }
+}
+
+TEST(BitVectorTest, SelectZeroBasic) {
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b1100010110010110;
+  BitVector bv(bits, 16);
+
+  EXPECT_EQ(bv.select0(1), 0);
+  EXPECT_EQ(bv.select0(2), 3);
+  EXPECT_EQ(bv.select0(3), 5);
+  EXPECT_EQ(bv.select0(4), 6);
+  EXPECT_EQ(bv.select0(5), 9);
+  EXPECT_EQ(bv.select0(6), 11);
+  EXPECT_EQ(bv.select0(7), 12);
+  EXPECT_EQ(bv.select0(8), 13);
+}
+
+TEST(BitVectorTest, MainRankZeroTest) {
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
+
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_count = 0;
+  for (size_t i = 0; i < bv.size(); ++i) {
+    ASSERT_EQ(zero_count, bv.rank0(i));
+    zero_count += (bv[i] == 0) ? 1 : 0;
+  }
+}
+
+TEST(BitVectorTest, MainSelectZeroTest) {
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
+
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_rank = 0;
+
+  for (size_t i = 0; i < bv.size(); ++i) {
+    if (bv[i] == 0) {
+      zero_rank++;
+      EXPECT_EQ(bv.select0(zero_rank), i);
+      EXPECT_EQ(bv.rank0(i), zero_rank - 1);
+      EXPECT_EQ(bv.rank0(i + 1), zero_rank);
+    }
+  }
+}
+
 TEST(BitVectorInterleavedTest, AtTest) {
   std::mt19937_64 rng(42);
   /*
@@ -330,6 +435,52 @@ TEST(LowerBound8x64, Random) {
   }
 }
 
+TEST(LowerBoundDlt4x64, Random) {
+  std::vector<uint64_t> x(4);
+  uint64_t dlt_array[4];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    bool fl = true;
+    for (size_t j = 0; j < 4; j++) {
+      dlt_array[j] = rng();
+      x[j] = rng();
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 4) {
+      ASSERT_EQ(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    } else {
+      ASSERT_GE(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    }
+  }
+}
+
+TEST(LowerBoundDlt8x64, Random) {
+  std::vector<uint64_t> x(8);
+  uint64_t dlt_array[8];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    bool fl = true;
+    for (size_t j = 0; j < 8; j++) {
+      dlt_array[j] = rng();
+      x[j] = rng();
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 8) {
+      ASSERT_EQ(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    } else {
+      ASSERT_GE(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    }
+  }
+}
+
 TEST(LowerBound32x16, Random) {
   std::vector<uint16_t> x(32);
   std::mt19937 rng(42);
@@ -344,6 +495,28 @@ TEST(LowerBound32x16, Random) {
   }
 }
 
+TEST(LowerBoundDlt32x16, Random) {
+  std::vector<uint16_t> x(32);
+  uint16_t dlt_array[32];
+  std::mt19937 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint16_t y = rng();
+    uint16_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    for (size_t j = 0; j < 32; j++) {
+      x[j] = rng();
+      if (dlt_scalar < x[j]) {
+        dlt_array[j] =
+            x[j] - dlt_scalar + rng() % ((1 << 16) - x[j] + dlt_scalar);
+      } else {
+        dlt_array[j] = rng() % ((1 << 16) - dlt_scalar);
+      }
+      cnt += dlt_array[j] + dlt_scalar - x[j] < y;
+    }
+    ASSERT_EQ(lower_bound_dlt_32x16(x.data(), y, dlt_array, dlt_scalar), cnt);
+  }
+}
+
 TEST(Popcount64x4, Random) {
   std::vector x(32);
   std::vector y(32);