From ce17d63f2c5656ff7fc4404d76150edb0b3ded86 Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Tue, 16 Dec 2025 16:38:31 +0300
Subject: [PATCH 1/9] lower_bound_zeros_*

---
 include/bits.h    | 101 ++++++++++++++++++++++++++++++++++++++++++++++
 src/unittests.cpp |  46 +++++++++++++++++++++
 2 files changed, 147 insertions(+)
diff --git a/include/bits.h b/include/bits.h
index c7ee26d..1ced870 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -107,6 +107,13 @@ uint64_t select_64(uint64_t x, uint64_t rank) {
   return _tzcnt_u64(_pdep_u64(1ull << rank, x));
 }
 
+/**
+ * @brief Return position of @p rank0 0 bit in @p x
+ */
+uint64_t select0_64(uint64_t x, uint64_t rank0) {
+  return select_64(~x, rank0);
+}
+
 /**
  * @brief Return position of @p rank 1 bit in @p x
  * @details Selecting within 64-bit word can be done
@@ -190,6 +197,57 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 4 64-bit numbers of ( @p pos + @p dlt - @p x )
+ * with @p y and return the length of the prefix
+ * where @p y is less then ( @p pos + @p dlt - @p x )
+ */
+uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m256i dlt_256 = _mm256_loadu_epi64(dlt);
+  auto x_256 = _mm256_loadu_epi64(x);
+  auto pos_4 = _mm256_set1_epi64x(pos);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(pos_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+  auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  const __m256i dlt_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto pos_4 = _mm256_set1_epi64x(pos);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(pos_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_4, offset);
+  auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
+      x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
+
+  return _tzcnt_u32(mask) >> 3;
+
+#else
+
+  for (uint16_t i = 0; i < 4; ++i) {
+    if (pos + dlt[i] - x[i] >= y) {
+      return i;
+    }
+  }
+  return 4;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 8 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -227,6 +285,49 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 8 64-bit numbers of ( @p pos + @p dlt - @p x )
+ * with @p y and return the length of the prefix
+ * where @p y is less then ( @p pos + @p dlt - @p x )
+ */
+uint16_t lower_bound_zeros_8x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto pos_8 = _mm512_set1_epi64(pos);
+  auto y_8 = _mm512_set1_epi64(y);
+
+  auto tmp = _mm512_add_epi64(pos_8, dlt_512);
+  auto reg_512 = _mm512_sub_epi64(tmp, x_512);
+  auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  uint16_t len = lower_bound_zeros_4x64(x, y, dlt, pos);
+
+  if (len < 4) {
+    return len;
+  }
+
+  return len + lower_bound_zeros_4x64(x + 4, y, dlt + 4, pos);
+
+#else
+
+  for (uint16_t i = 0; i < 8; ++i) {
+    if (pos + dlt[i] - x[i] >= y) {
+      return i;
+    }
+  }
+  return 8;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 32 16-bit numbers of @p x with @p y and
  * return the count of numbers where @p x is less then @p y
diff --git a/src/unittests.cpp b/src/unittests.cpp
index dddb0b7..9e202e2 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -330,6 +330,52 @@ TEST(LowerBound8x64, Random) {
   }
 }
 
+TEST(LowerBoundZeros4x64, Random) {
+  std::vector<uint64_t> x(8);
+  uint64_t dlt[8];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t pos = rng();
+    uint16_t cnt = 0;
+    bool fl = 1;
+    for (size_t j = 0; j < 4; j++) {
+      dlt[j] = rng();
+      x[j] = rng();
+      fl &= pos + dlt[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 4) {
+      ASSERT_EQ(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt);
+    } else {
+      ASSERT_GE(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt);
+    }
+  }
+}
+
+TEST(LowerBoundZeros8x64, Random) {
+  std::vector<uint64_t> x(8);
+  uint64_t dlt[8];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t pos = rng();
+    uint16_t cnt = 0;
+    bool fl = 1;
+    for (size_t j = 0; j < 8; j++) {
+      dlt[j] = rng();
+      x[j] = rng();
+      fl &= pos + dlt[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 8) {
+      ASSERT_EQ(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt);
+    } else {
+      ASSERT_GE(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt);
+    }
+  }
+}
+
 TEST(LowerBound32x16, Random) {
   std::vector<uint16_t> x(32);
   std::mt19937 rng(42);

From 2ab5c844cc22e9532372363eb21e149bc6980f7c Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Wed, 17 Dec 2025 14:34:43 +0300
Subject: [PATCH 2/9] Format fix

---
 include/bits.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/bits.h b/include/bits.h
index 1ced870..9af8eea 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -202,7 +202,10 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
  * with @p y and return the length of the prefix
  * where @p y is less then ( @p pos + @p dlt - @p x )
  */
-uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) {
+uint16_t lower_bound_zeros_4x64(const uint64_t* x,
+                                uint64_t y,
+                                const uint64_t* dlt,
+                                uint64_t pos) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   const __m256i dlt_256 = _mm256_loadu_epi64(dlt);
@@ -219,7 +222,8 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x, uint64_t y, const uint64_t* d
 #else
 #ifdef PIXIE_AVX2_SUPPORT
 
-  const __m256i dlt_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt));
+  const __m256i dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt));
   auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
   auto pos_4 = _mm256_set1_epi64x(pos);
   auto y_4 = _mm256_set1_epi64x(y);
@@ -290,7 +294,10 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
  * with @p y and return the length of the prefix
  * where @p y is less then ( @p pos + @p dlt - @p x )
  */
-uint16_t lower_bound_zeros_8x64(const uint64_t* x, uint64_t y, const uint64_t* dlt, uint64_t pos) {
+uint16_t lower_bound_zeros_8x64(const uint64_t* x,
+                                uint64_t y,
+                                const uint64_t* dlt,
+                                uint64_t pos) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   const __m512i dlt_512 = _mm512_loadu_epi64(dlt);

From c947e32afc45ec7b35188e92ae601037ece3f880 Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Thu, 18 Dec 2025 14:00:51 +0300
Subject: [PATCH 3/9] lower_bound_dlt_32x16 + select0_512 + tests

---
 include/bits.h      | 160 ++++++++++++++++++++++++++++++++---------
 include/bitvector.h | 169 +++++++++++++++++++++++++++++++++++++++++---
 src/unittests.cpp   |  85 ++++++++++++++++++----
 3 files changed, 354 insertions(+), 60 deletions(-)

diff --git a/include/bits.h b/include/bits.h
index 9af8eea..dd87f7f 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -107,13 +107,6 @@ uint64_t select_64(uint64_t x, uint64_t rank) {
   return _tzcnt_u64(_pdep_u64(1ull << rank, x));
 }
 
-/**
- * @brief Return position of @p rank0 0 bit in @p x
- */
-uint64_t select0_64(uint64_t x, uint64_t rank0) {
-  return select_64(~x, rank0);
-}
-
 /**
  * @brief Return position of @p rank 1 bit in @p x
  * @details Selecting within 64-bit word can be done
@@ -135,8 +128,8 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   __m512i res = _mm512_loadu_epi64(x);
-  std::array<uint64_t, 8> counts;
-  _mm512_storeu_epi64(counts.data(), _mm512_popcnt_epi64(res));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
 
   size_t i = 0;
   while (i < 8 && counts[i] <= rank) {
@@ -157,6 +150,37 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #endif
 }
 
+/**
+ * @brief Return position of @p rank0 0 bit in @p x
+ * @details select_512 with bit inversion.
+ */
+uint64_t select0_512(const uint64_t* x, uint64_t rank0) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  __m512i res = _mm512_loadu_epi64(x);
+  res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
+
+  size_t i = 0;
+  while (i < 8 && counts[i] <= rank0) {
+    rank0 -= counts[i++];
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#else
+
+  size_t i = 0;
+  int popcount = std::popcount(~x[0]);
+  while (i < 7 && popcount <= rank0) {
+    rank0 -= popcount;
+    popcount = std::popcount(~x[++i]);
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#endif
+}
+
 /**
  * @brief Compare 4 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -198,22 +222,22 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
 }
 
 /**
- * @brief Compare 4 64-bit numbers of ( @p pos + @p dlt - @p x )
+ * @brief Compare 4 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
  * with @p y and return the length of the prefix
- * where @p y is less then ( @p pos + @p dlt - @p x )
+ * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x )
  */
-uint16_t lower_bound_zeros_4x64(const uint64_t* x,
-                                uint64_t y,
-                                const uint64_t* dlt,
-                                uint64_t pos) {
+uint16_t lower_bound_dlt_4x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
 #ifdef PIXIE_AVX512_SUPPORT
 
-  const __m256i dlt_256 = _mm256_loadu_epi64(dlt);
+  const __m256i dlt_256 = _mm256_loadu_epi64(dlt_array);
   auto x_256 = _mm256_loadu_epi64(x);
-  auto pos_4 = _mm256_set1_epi64x(pos);
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
   auto y_4 = _mm256_set1_epi64x(y);
 
-  auto tmp = _mm256_add_epi64(pos_4, dlt_256);
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
   auto reg_256 = _mm256_sub_epi64(tmp, x_256);
   auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
 
@@ -223,12 +247,12 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x,
 #ifdef PIXIE_AVX2_SUPPORT
 
   const __m256i dlt_256 =
-      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt));
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
   auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
-  auto pos_4 = _mm256_set1_epi64x(pos);
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
   auto y_4 = _mm256_set1_epi64x(y);
 
-  auto tmp = _mm256_add_epi64(pos_4, dlt_256);
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
   auto reg_256 = _mm256_sub_epi64(tmp, x_256);
 
   const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
@@ -242,7 +266,7 @@ uint16_t lower_bound_zeros_4x64(const uint64_t* x,
 #else
 
   for (uint16_t i = 0; i < 4; ++i) {
-    if (pos + dlt[i] - x[i] >= y) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
       return i;
     }
   }
@@ -290,22 +314,22 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
 }
 
 /**
- * @brief Compare 8 64-bit numbers of ( @p pos + @p dlt - @p x )
+ * @brief Compare 8 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
  * with @p y and return the length of the prefix
- * where @p y is less then ( @p pos + @p dlt - @p x )
+ * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x )
  */
-uint16_t lower_bound_zeros_8x64(const uint64_t* x,
-                                uint64_t y,
-                                const uint64_t* dlt,
-                                uint64_t pos) {
+uint16_t lower_bound_dlt_8x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
 #ifdef PIXIE_AVX512_SUPPORT
 
-  const __m512i dlt_512 = _mm512_loadu_epi64(dlt);
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
   auto x_512 = _mm512_loadu_epi64(x);
-  auto pos_8 = _mm512_set1_epi64(pos);
+  auto dlt_8 = _mm512_set1_epi64(dlt_scalar);
   auto y_8 = _mm512_set1_epi64(y);
 
-  auto tmp = _mm512_add_epi64(pos_8, dlt_512);
+  auto tmp = _mm512_add_epi64(dlt_8, dlt_512);
   auto reg_512 = _mm512_sub_epi64(tmp, x_512);
   auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
 
@@ -314,18 +338,18 @@ uint16_t lower_bound_zeros_8x64(const uint64_t* x,
 #else
 #ifdef PIXIE_AVX2_SUPPORT
 
-  uint16_t len = lower_bound_zeros_4x64(x, y, dlt, pos);
+  uint16_t len = lower_bound_dlt_4x64(x, y, dlt_array, dlt_scalar);
 
   if (len < 4) {
     return len;
   }
 
-  return len + lower_bound_zeros_4x64(x + 4, y, dlt + 4, pos);
+  return len + lower_bound_dlt_4x64(x + 4, y, dlt_array + 4, dlt_scalar);
 
 #else
 
   for (uint16_t i = 0; i < 8; ++i) {
-    if (pos + dlt[i] - x[i] >= y) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
       return i;
     }
   }
@@ -339,7 +363,7 @@ uint16_t lower_bound_zeros_8x64(const uint64_t* x,
  * @brief Compare 32 16-bit numbers of @p x with @p y and
  * return the count of numbers where @p x is less then @p y
  */
-uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
+uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   auto y_32 = _mm512_set1_epi16(y);
@@ -381,6 +405,72 @@ uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 32 16-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the count of numbers where
+ * ( @p dlt_array + @p dlt_scalar - @p x ) is less then @p y
+ */
+uint16_t lower_bound_dlt_32x16(const uint16_t* x,
+                               uint16_t y,
+                               const uint16_t* dlt_array,
+                               uint16_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto dlt_32 = _mm512_set1_epi16(dlt_scalar);
+  auto y_32 = _mm512_set1_epi16(y);
+
+  auto tmp = _mm512_add_epi16(dlt_32, dlt_512);
+  auto reg_512 = _mm512_sub_epi16(tmp, x_512);
+  auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
+  return std::popcount(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  auto dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto dlt_16 = _mm256_set1_epi16(dlt_scalar);
+  auto y_16 = _mm256_set1_epi16(y);
+
+  auto tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  auto reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi16(0x8000);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_16, offset);
+  uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  uint16_t count = std::popcount(mask) >> 1;
+
+  dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array + 16));
+  x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + 16));
+
+  tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  x_offset = _mm256_xor_si256(reg_256, offset);
+  mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  return count + (std::popcount(mask) >> 1);
+
+#else
+
+  uint16_t cnt = 0;
+  for (uint16_t i = 0; i < 32; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] < y) {
+      cnt++;
+    }
+  }
+  return cnt;
+
+#endif
+#endif
+}
+
 /**
  * @brief Calculates 64 popcounts of 4-bits integers and stores as 64 4-bits
  * integers (packed into 32 bytes)
diff --git a/include/bitvector.h b/include/bitvector.h
index 117e00a..de2a530 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -72,9 +72,24 @@ class BitVector {
   constexpr static size_t kSelectSampleFrequency = 16384;
   constexpr static size_t kBlocksPerSuperBlock = 128;
 
+  struct PreCalculation {
+    alignas(64) static uint64_t dlt_super[8];
+    alignas(64) static uint16_t dlt_basic[32];
+
+    constexpr PreCalculation() {
+      for (size_t i = 0; i < 8; ++i) {
+        dlt_super[i] = i * kSuperBlockSize;
+      }
+      for (size_t i = 0; i < 32; ++i) {
+        dlt_basic[i] = i * kBasicBlockSize;
+      }
+    }
+  };
+
   std::vector<uint64_t> super_block_rank;
   std::vector<uint16_t> basic_block_rank;
   std::vector<uint64_t> select_samples;
+  std::vector<uint64_t> select0_samples;
   const size_t num_bits_;
   const size_t padded_size_;
   size_t max_rank;
@@ -119,10 +134,14 @@ class BitVector {
    */
   void build_select() {
     uint64_t milestone = kSelectSampleFrequency;
+    uint64_t milestone0 = kSelectSampleFrequency;
     uint64_t rank = 0;
+    uint64_t rank0 = 0;
     select_samples.emplace_back(0);
+    select0_samples.emplace_back(0);
     for (size_t i = 0; i < bits.size(); ++i) {
       auto ones = std::popcount(bits[i]);
+      auto zeros = 64 - ones;
       if (rank + ones >= milestone) {
         auto pos = select_64(bits[i], milestone - rank - 1);
         // TODO: try including global rank into select samples to save
@@ -130,7 +149,13 @@ class BitVector {
         select_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
         milestone += kSelectSampleFrequency;
       }
+      if (rank0 + zeros >= milestone0) {
+        auto pos = select_64(~bits[i], milestone0 - rank0 - 1);
+        select0_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
+        milestone0 += kSelectSampleFrequency;
+      }
       rank += ones;
+      rank0 += zeros;
     }
   }
 
@@ -160,6 +185,36 @@ class BitVector {
     return left - 1;
   }
 
+  /**
+   * @brief first step of the select0 operation
+   */
+  uint64_t find_superblock_zeros(uint64_t rank0) const {
+    uint64_t left = select0_samples[rank0 / kSelectSampleFrequency];
+
+    while (left + 7 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0,
+                                      PreCalculation::dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 8) {
+        return left + len - 1;
+      }
+      left += 8;
+    }
+    if (left + 3 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0,
+                                      PreCalculation::dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 4) {
+        return left + len - 1;
+      }
+      left += 4;
+    }
+    while (left < super_block_rank.size() && super_block_rank[left] < rank0) {
+      left++;
+    }
+    return left - 1;
+  }
+
   /**
    * @brief SIMD-optimized linear scan
    * @details
@@ -167,17 +222,32 @@ class BitVector {
    * 4 iterations.
    */
   uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const {
-    auto pos = 0;
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+    }
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
+  }
 
-    for (size_t i = 0; i < 4; ++i) {
-      auto count = lower_bound_32x16(&basic_block_rank[128 * s_block + 32 * i],
-                                     local_rank);
+  /**
+   * @brief SIMD-optimized linear scan
+   * @details
+   * Processes 32 16-bit entries at once (full cache line), so there is at most
+   * 4 iterations.
+   */
+  uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const {
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          PreCalculation::dlt_basic, kBasicBlockSize * pos);
       if (count < 32) {
         return kBlocksPerSuperBlock * s_block + pos + count - 1;
       }
-      pos += 32;
     }
-    return kBlocksPerSuperBlock * s_block + pos - 1;
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
   }
 
   /**
@@ -197,8 +267,8 @@ class BitVector {
     pos = pos + 16 < 32 ? 0 : (pos - 16);
     pos = pos > 96 ? 96 : pos;
     while (pos < 96) {
-      auto count =
-          lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
       if (count == 0) {
         return find_basicblock(local_rank, s_block);
       }
@@ -208,14 +278,54 @@ class BitVector {
       pos += 32;
     }
     pos = 96;
-    auto count =
-        lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+    auto count = lower_bound_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
     if (count == 0) {
       return find_basicblock(local_rank, s_block);
     }
     return kBlocksPerSuperBlock * s_block + pos + count - 1;
   }
 
+  /**
+   * @brief Interpolation search with SIMD optimization
+   * @details
+   * Similar to find_basicblock_zeros but initial guess is based on linear
+   * interpolation, for random data it should make initial guess correct
+   * most of the times, we start from the 32 wide block with interpolation
+   * guess at the center, if we see that select result lie in lower blocks
+   * we backoff to find_basicblock_zeros
+   */
+  uint64_t find_basicblock_is_zeros(uint16_t local_rank0,
+                                    uint64_t s_block) const {
+    auto lower = kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto upper =
+        kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1];
+
+    uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower);
+    pos = pos + 16 < 32 ? 0 : (pos - 16);
+    pos = pos > 96 ? 96 : pos;
+    while (pos < 96) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          PreCalculation::dlt_basic, kBasicBlockSize * pos);
+      if (count == 0) {
+        return find_basicblock_zeros(local_rank0, s_block);
+      }
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+      pos += 32;
+    }
+    pos = 96;
+    auto count = lower_bound_dlt_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+        PreCalculation::dlt_basic, kBasicBlockSize * pos);
+    if (count == 0) {
+      return find_basicblock_zeros(local_rank0, s_block);
+    }
+    return kBlocksPerSuperBlock * s_block + pos + count - 1;
+  }
+
  public:
   /**
    * Constructor from an external array of uint64_t.
@@ -258,6 +368,16 @@ class BitVector {
     return result;
   }
 
+  /**
+   * Rank zero operation: count the number of 0-bits up to position pos
+   * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1]
+   */
+  uint64_t rank0(size_t pos) const { return pos - rank(pos); }
+
+  /**
+   * Select operation: find the position of the i-th occurrence of a 1-bit
+   * select_1(rank) = index of the rank-th occurrence of 1-bit
+   */
   uint64_t select(size_t rank) const {
     if (rank > max_rank) [[unlikely]] {
       return num_bits_;
@@ -283,6 +403,35 @@ class BitVector {
     return kWordSize * pos + select_512(&bits[pos], rank - 1);
   }
 
+  /**
+   * Select zero operation: find the position of the i-th occurrence of a 0-bit
+   * select_0(rank) = index of the rank-th occurrence of 0-bit
+   */
+  uint64_t select0(size_t rank) const {
+    if (rank > max_rank) [[unlikely]] {
+      return num_bits_;
+    }
+    if (rank == 0) [[unlikely]] {
+      return 0;
+    }
+    uint64_t s_block = find_superblock(rank);
+    rank -= super_block_rank[s_block];
+    auto pos = find_basicblock_is(rank, s_block);
+    rank -= basic_block_rank[pos];
+    pos *= kWordsPerBlock;
+
+    // Final search
+    if (pos + kWordsPerBlock - 1 < kWordsPerBlock) [[unlikely]] {
+      size_t ones = std::popcount(bits[pos]);
+      while (pos < bits.size() && ones < rank) {
+        rank -= ones;
+        ones = std::popcount(bits[++pos]);
+      }
+      return kWordSize * pos + select_64(bits[pos], rank - 1);
+    }
+    return kWordSize * pos + select_512(&bits[pos], rank - 1);
+  }
+
   /**
    * Convert the bit vector to a binary string, for debug purpose only
    */
diff --git a/src/unittests.cpp b/src/unittests.cpp
index 9e202e2..0203f1a 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -89,6 +89,14 @@ TEST(Select512, Ones) {
   }
 }
 
+TEST(SelectZero512, Zeros) {
+  std::array<uint64_t, 8> a{0, 0, 0, 0, 0, 0, 0, 0};
+  for (size_t i = 0; i < 512; ++i) {
+    auto p = select0_512(a.data(), i);
+    EXPECT_EQ(p, i);
+  }
+}
+
 TEST(Select512, Random) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -114,6 +122,31 @@ TEST(Select512, Random) {
   }
 }
 
+TEST(SelectZero512, Random) {
+  std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max()};
+
+  std::mt19937_64 rng(42);
+  for (size_t t = 0; t < 1000; ++t) {
+    for (auto& x : a) {
+      x = rng();
+    }
+    size_t rank = 0;
+    for (size_t i = 0; i < 512; ++i) {
+      if ((1 & (a[i >> 6] >> (i & 63))) == 0) {
+        auto p = select0_512(a.data(), rank++);
+        ASSERT_EQ(p, i);
+      }
+    }
+  }
+}
+
 TEST(Select512, RankCompativility) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -330,48 +363,48 @@ TEST(LowerBound8x64, Random) {
   }
 }
 
-TEST(LowerBoundZeros4x64, Random) {
-  std::vector<uint64_t> x(8);
-  uint64_t dlt[8];
+TEST(LowerBoundDlt4x64, Random) {
+  std::vector<uint64_t> x(4);
+  uint64_t dlt_array[4];
   std::mt19937_64 rng(42);
   for (size_t i = 0; i < 100000; i++) {
     uint64_t y = rng();
-    uint64_t pos = rng();
+    uint64_t dlt_scalar = rng();
     uint16_t cnt = 0;
     bool fl = 1;
     for (size_t j = 0; j < 4; j++) {
-      dlt[j] = rng();
+      dlt_array[j] = rng();
       x[j] = rng();
-      fl &= pos + dlt[j] - x[j] < y;
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
       cnt += fl;
     }
     if (cnt < 4) {
-      ASSERT_EQ(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt);
+      ASSERT_EQ(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
     } else {
-      ASSERT_GE(lower_bound_zeros_4x64(x.data(), y, dlt, pos), cnt);
+      ASSERT_GE(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
     }
   }
 }
 
-TEST(LowerBoundZeros8x64, Random) {
+TEST(LowerBoundDlt8x64, Random) {
   std::vector<uint64_t> x(8);
-  uint64_t dlt[8];
+  uint64_t dlt_array[8];
   std::mt19937_64 rng(42);
   for (size_t i = 0; i < 100000; i++) {
     uint64_t y = rng();
-    uint64_t pos = rng();
+    uint64_t dlt_scalar = rng();
     uint16_t cnt = 0;
     bool fl = 1;
     for (size_t j = 0; j < 8; j++) {
-      dlt[j] = rng();
+      dlt_array[j] = rng();
       x[j] = rng();
-      fl &= pos + dlt[j] - x[j] < y;
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
       cnt += fl;
     }
     if (cnt < 8) {
-      ASSERT_EQ(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt);
+      ASSERT_EQ(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
     } else {
-      ASSERT_GE(lower_bound_zeros_8x64(x.data(), y, dlt, pos), cnt);
+      ASSERT_GE(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
     }
   }
 }
@@ -390,6 +423,28 @@ TEST(LowerBound32x16, Random) {
   }
 }
 
+TEST(LowerBoundDlt32x16, Random) {
+  std::vector<uint16_t> x(32);
+  uint16_t dlt_array[32];
+  std::mt19937 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint16_t y = rng();
+    uint16_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    for (size_t j = 0; j < 32; j++) {
+      x[j] = rng();
+      if (dlt_scalar < x[j]) {
+        dlt_array[j] =
+            x[j] - dlt_scalar + rng() % ((1 << 16) - x[j] + dlt_scalar);
+      } else {
+        dlt_array[j] = rng() % ((1 << 16) - dlt_scalar);
+      }
+      cnt += dlt_array[j] + dlt_scalar - x[j] < y;
+    }
+    ASSERT_EQ(lower_bound_dlt_32x16(x.data(), y, dlt_array, dlt_scalar), cnt);
+  }
+}
+
 TEST(Popcount64x4, Random) {
   std::vector<uint8_t> x(32);
   std::vector<uint8_t> y(32);

From a953b868e3fd34443a64f99a4b95725ab474a7af Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Thu, 18 Dec 2025 15:40:27 +0300
Subject: [PATCH 4/9] rank0/select0 + tests

---
 include/bitvector.h | 66 ++++++++++++++++++++--------------------
 src/unittests.cpp   | 74 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/include/bitvector.h b/include/bitvector.h
index de2a530..2ed9b1a 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -7,6 +7,8 @@
 #include <string>
 #include <vector>
 
+//#include <iostream>
+
 #include "bits.h"
 
 namespace pixie {
@@ -72,19 +74,8 @@ class BitVector {
   constexpr static size_t kSelectSampleFrequency = 16384;
   constexpr static size_t kBlocksPerSuperBlock = 128;
 
-  struct PreCalculation {
-    alignas(64) static uint64_t dlt_super[8];
-    alignas(64) static uint16_t dlt_basic[32];
-
-    constexpr PreCalculation() {
-      for (size_t i = 0; i < 8; ++i) {
-        dlt_super[i] = i * kSuperBlockSize;
-      }
-      for (size_t i = 0; i < 32; ++i) {
-        dlt_basic[i] = i * kBasicBlockSize;
-      }
-    }
-  };
+  alignas(64) uint64_t dlt_super[8];
+  alignas(64) uint16_t dlt_basic[32];
 
   std::vector<uint64_t> super_block_rank;
   std::vector<uint16_t> basic_block_rank;
@@ -157,6 +148,13 @@ class BitVector {
       rank += ones;
       rank0 += zeros;
     }
+
+    for (size_t i = 0; i < 8; ++i) {
+      dlt_super[i] = i * kSuperBlockSize;
+    }
+    for (size_t i = 0; i < 32; ++i) {
+      dlt_basic[i] = i * kBasicBlockSize;
+    }
   }
 
   /**
@@ -193,7 +191,7 @@ class BitVector {
 
     while (left + 7 < super_block_rank.size()) {
       auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0,
-                                      PreCalculation::dlt_super,
+                                      dlt_super,
                                       kSuperBlockSize * left);
       if (len < 8) {
         return left + len - 1;
@@ -202,7 +200,7 @@ class BitVector {
     }
     if (left + 3 < super_block_rank.size()) {
       auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0,
-                                      PreCalculation::dlt_super,
+                                      dlt_super,
                                       kSuperBlockSize * left);
       if (len < 4) {
         return left + len - 1;
@@ -242,7 +240,7 @@ class BitVector {
     for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
       auto count = lower_bound_dlt_32x16(
           &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
-          PreCalculation::dlt_basic, kBasicBlockSize * pos);
+          dlt_basic, kBasicBlockSize * pos);
       if (count < 32) {
         return kBlocksPerSuperBlock * s_block + pos + count - 1;
       }
@@ -307,7 +305,7 @@ class BitVector {
     while (pos < 96) {
       auto count = lower_bound_dlt_32x16(
           &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
-          PreCalculation::dlt_basic, kBasicBlockSize * pos);
+          dlt_basic, kBasicBlockSize * pos);
       if (count == 0) {
         return find_basicblock_zeros(local_rank0, s_block);
       }
@@ -319,7 +317,7 @@ class BitVector {
     pos = 96;
     auto count = lower_bound_dlt_32x16(
         &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
-        PreCalculation::dlt_basic, kBasicBlockSize * pos);
+        dlt_basic, kBasicBlockSize * pos);
     if (count == 0) {
       return find_basicblock_zeros(local_rank0, s_block);
     }
@@ -358,6 +356,9 @@ class BitVector {
    * rank_1(pos) = number of 1s in positions [0...pos-1]
    */
   uint64_t rank(size_t pos) const {
+    if (pos >= num_bits_) [[unlikely]] {
+      return max_rank;
+    }
     uint64_t b_block = pos / kBasicBlockSize;
     uint64_t s_block = pos / kSuperBlockSize;
     // Precomputed rank
@@ -405,31 +406,32 @@ class BitVector {
 
   /**
    * Select zero operation: find the position of the i-th occurrence of a 0-bit
-   * select_0(rank) = index of the rank-th occurrence of 0-bit
+   * select_0(rank0) = index of the rank0-th occurrence of 0-bit
    */
-  uint64_t select0(size_t rank) const {
-    if (rank > max_rank) [[unlikely]] {
+  uint64_t select0(size_t rank0) const {
+    if (rank0 > num_bits_ - max_rank) [[unlikely]] {
       return num_bits_;
     }
-    if (rank == 0) [[unlikely]] {
+    if (rank0 == 0) [[unlikely]] {
       return 0;
     }
-    uint64_t s_block = find_superblock(rank);
-    rank -= super_block_rank[s_block];
-    auto pos = find_basicblock_is(rank, s_block);
-    rank -= basic_block_rank[pos];
+    uint64_t s_block = find_superblock_zeros(rank0);
+    rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto pos = find_basicblock_is_zeros(rank0, s_block);
+    auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1);
+    rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos];
     pos *= kWordsPerBlock;
 
     // Final search
     if (pos + kWordsPerBlock - 1 < kWordsPerBlock) [[unlikely]] {
-      size_t ones = std::popcount(bits[pos]);
-      while (pos < bits.size() && ones < rank) {
-        rank -= ones;
-        ones = std::popcount(bits[++pos]);
+      size_t zeros = std::popcount(~bits[pos]);
+      while (pos < bits.size() && zeros < rank0) {
+        rank0 -= zeros;
+        zeros = std::popcount(~bits[++pos]);
       }
-      return kWordSize * pos + select_64(bits[pos], rank - 1);
+      return kWordSize * pos + select_64(~bits[pos], rank0 - 1);
     }
-    return kWordSize * pos + select_512(&bits[pos], rank - 1);
+    return kWordSize * pos + select0_512(&bits[pos], rank0 - 1);
   }
 
   /**
diff --git a/src/unittests.cpp b/src/unittests.cpp
index 0203f1a..20db4a6 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -242,7 +242,7 @@ TEST(BitVectorTest, RankWithZeros) {
 TEST(BitVectorTest, SelectBasic) {
   std::vector<uint64_t> bits(8, 0);
   bits[0] = 0b1100010110010110;
-  BitVector bv(bits, 5);
+  BitVector bv(bits, 16);
 
   EXPECT_EQ(bv.select(1), 1);
   EXPECT_EQ(bv.select(2), 2);
@@ -288,6 +288,78 @@ TEST(BitVectorTest, MainSelectTest) {
   }
 }
 
+TEST(BitVectorTest, RankZeroBasic) {
+    std::vector<uint64_t> bits(8, 0);
+    bits[0] = 0b10110;
+    BitVector bv(bits, 10);
+
+    EXPECT_EQ(bv.rank0(0), 0);  
+    EXPECT_EQ(bv.rank0(1), 1);  
+    EXPECT_EQ(bv.rank0(2), 1); 
+    EXPECT_EQ(bv.rank0(3), 1);  
+    EXPECT_EQ(bv.rank0(4), 2);  
+    EXPECT_EQ(bv.rank0(5), 2);  
+}
+
+TEST(BitVectorTest, RankZeroWithOnes) {
+    std::vector<uint64_t> bits(8, 0);
+    BitVector bv(bits, 5);
+
+    for (size_t i = 0; i <= 5; i++) {
+        EXPECT_EQ(bv.rank0(i), i); 
+    }
+}
+
+TEST(BitVectorTest, SelectZeroBasic) {
+    std::vector<uint64_t> bits(8, 0);
+    bits[0] = 0b1100010110010110;
+    BitVector bv(bits, 16);
+
+    EXPECT_EQ(bv.select0(1), 0);  
+    EXPECT_EQ(bv.select0(2), 3);
+    EXPECT_EQ(bv.select0(3), 5);
+    EXPECT_EQ(bv.select0(4), 6);
+    EXPECT_EQ(bv.select0(5), 9);
+    EXPECT_EQ(bv.select0(6), 11);
+    EXPECT_EQ(bv.select0(7), 12);
+    EXPECT_EQ(bv.select0(8), 13);
+}
+
+TEST(BitVectorTest, MainRankZeroTest) {
+    std::mt19937_64 rng(42);
+    std::vector<uint64_t> bits(65536 * 32);
+    for (size_t i = 0; i < bits.size(); i++) {
+        bits[i] = rng();
+    }
+
+    BitVector bv(bits, bits.size() * 64);
+    uint64_t zero_count = 0;
+    for (size_t i = 0; i < bv.size(); ++i) {
+        ASSERT_EQ(zero_count, bv.rank0(i));
+        zero_count += (bv[i] == 0) ? 1 : 0;
+    }
+}
+
+TEST(BitVectorTest, MainSelectZeroTest) {
+    std::mt19937_64 rng(42);
+    std::vector<uint64_t> bits(65536 * 32);
+    for (size_t i = 0; i < bits.size(); i++) {
+        bits[i] = rng();
+    }
+
+    BitVector bv(bits, bits.size() * 64);
+    uint64_t zero_rank = 0;
+
+    for (size_t i = 0; i < bv.size(); ++i) {
+        if (bv[i] == 0) {
+            zero_rank++;
+            EXPECT_EQ(bv.select0(zero_rank), i);
+            EXPECT_EQ(bv.rank0(i), zero_rank - 1);
+            EXPECT_EQ(bv.rank0(i + 1), zero_rank);
+        }
+    }
+}
+
 TEST(BitVectorInterleavedTest, AtTest) {
   std::mt19937_64 rng(42);
   /*

From 13181afe18ca282d7176e14836323dbf238e1965 Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Thu, 18 Dec 2025 15:41:31 +0300
Subject: [PATCH 5/9] Format fix

---
 include/bitvector.h |   8 ++--
 src/unittests.cpp   | 100 ++++++++++++++++++++++----------------------
 2 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/include/bitvector.h b/include/bitvector.h
index 2ed9b1a..638e687 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>
 
-//#include <iostream>
+// #include <iostream>
 
 #include "bits.h"
 
@@ -190,8 +190,7 @@ class BitVector {
     uint64_t left = select0_samples[rank0 / kSelectSampleFrequency];
 
     while (left + 7 < super_block_rank.size()) {
-      auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0,
-                                      dlt_super,
+      auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, dlt_super,
                                       kSuperBlockSize * left);
       if (len < 8) {
         return left + len - 1;
@@ -199,8 +198,7 @@ class BitVector {
       left += 8;
     }
     if (left + 3 < super_block_rank.size()) {
-      auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0,
-                                      dlt_super,
+      auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, dlt_super,
                                       kSuperBlockSize * left);
       if (len < 4) {
         return left + len - 1;
diff --git a/src/unittests.cpp b/src/unittests.cpp
index 20db4a6..27d037c 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -289,75 +289,75 @@ TEST(BitVectorTest, MainSelectTest) {
 }
 
 TEST(BitVectorTest, RankZeroBasic) {
-    std::vector<uint64_t> bits(8, 0);
-    bits[0] = 0b10110;
-    BitVector bv(bits, 10);
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b10110;
+  BitVector bv(bits, 10);
 
-    EXPECT_EQ(bv.rank0(0), 0);  
-    EXPECT_EQ(bv.rank0(1), 1);  
-    EXPECT_EQ(bv.rank0(2), 1); 
-    EXPECT_EQ(bv.rank0(3), 1);  
-    EXPECT_EQ(bv.rank0(4), 2);  
-    EXPECT_EQ(bv.rank0(5), 2);  
+  EXPECT_EQ(bv.rank0(0), 0);
+  EXPECT_EQ(bv.rank0(1), 1);
+  EXPECT_EQ(bv.rank0(2), 1);
+  EXPECT_EQ(bv.rank0(3), 1);
+  EXPECT_EQ(bv.rank0(4), 2);
+  EXPECT_EQ(bv.rank0(5), 2);
 }
 
 TEST(BitVectorTest, RankZeroWithOnes) {
-    std::vector<uint64_t> bits(8, 0);
-    BitVector bv(bits, 5);
+  std::vector<uint64_t> bits(8, 0);
+  BitVector bv(bits, 5);
 
-    for (size_t i = 0; i <= 5; i++) {
-        EXPECT_EQ(bv.rank0(i), i); 
-    }
+  for (size_t i = 0; i <= 5; i++) {
+    EXPECT_EQ(bv.rank0(i), i);
+  }
 }
 
 TEST(BitVectorTest, SelectZeroBasic) {
-    std::vector<uint64_t> bits(8, 0);
-    bits[0] = 0b1100010110010110;
-    BitVector bv(bits, 16);
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b1100010110010110;
+  BitVector bv(bits, 16);
 
-    EXPECT_EQ(bv.select0(1), 0);  
-    EXPECT_EQ(bv.select0(2), 3);
-    EXPECT_EQ(bv.select0(3), 5);
-    EXPECT_EQ(bv.select0(4), 6);
-    EXPECT_EQ(bv.select0(5), 9);
-    EXPECT_EQ(bv.select0(6), 11);
-    EXPECT_EQ(bv.select0(7), 12);
-    EXPECT_EQ(bv.select0(8), 13);
+  EXPECT_EQ(bv.select0(1), 0);
+  EXPECT_EQ(bv.select0(2), 3);
+  EXPECT_EQ(bv.select0(3), 5);
+  EXPECT_EQ(bv.select0(4), 6);
+  EXPECT_EQ(bv.select0(5), 9);
+  EXPECT_EQ(bv.select0(6), 11);
+  EXPECT_EQ(bv.select0(7), 12);
+  EXPECT_EQ(bv.select0(8), 13);
 }
 
 TEST(BitVectorTest, MainRankZeroTest) {
-    std::mt19937_64 rng(42);
-    std::vector<uint64_t> bits(65536 * 32);
-    for (size_t i = 0; i < bits.size(); i++) {
-        bits[i] = rng();
-    }
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
 
-    BitVector bv(bits, bits.size() * 64);
-    uint64_t zero_count = 0;
-    for (size_t i = 0; i < bv.size(); ++i) {
-        ASSERT_EQ(zero_count, bv.rank0(i));
-        zero_count += (bv[i] == 0) ? 1 : 0;
-    }
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_count = 0;
+  for (size_t i = 0; i < bv.size(); ++i) {
+    ASSERT_EQ(zero_count, bv.rank0(i));
+    zero_count += (bv[i] == 0) ? 1 : 0;
+  }
 }
 
 TEST(BitVectorTest, MainSelectZeroTest) {
-    std::mt19937_64 rng(42);
-    std::vector<uint64_t> bits(65536 * 32);
-    for (size_t i = 0; i < bits.size(); i++) {
-        bits[i] = rng();
-    }
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
 
-    BitVector bv(bits, bits.size() * 64);
-    uint64_t zero_rank = 0;
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_rank = 0;
 
-    for (size_t i = 0; i < bv.size(); ++i) {
-        if (bv[i] == 0) {
-            zero_rank++;
-            EXPECT_EQ(bv.select0(zero_rank), i);
-            EXPECT_EQ(bv.rank0(i), zero_rank - 1);
-            EXPECT_EQ(bv.rank0(i + 1), zero_rank);
-        }
+  for (size_t i = 0; i < bv.size(); ++i) {
+    if (bv[i] == 0) {
+      zero_rank++;
+      EXPECT_EQ(bv.select0(zero_rank), i);
+      EXPECT_EQ(bv.rank0(i), zero_rank - 1);
+      EXPECT_EQ(bv.rank0(i + 1), zero_rank);
     }
+  }
 }
 
 TEST(BitVectorInterleavedTest, AtTest) {

From 92e0609349409c2b1894155e48889eb07eb9a70d Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Fri, 19 Dec 2025 15:22:40 +0300
Subject: [PATCH 6/9] rank/rank0 and find_superblock_zeros fix

---
 include/bitvector.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/bitvector.h b/include/bitvector.h
index 638e687..6f2e959 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -205,7 +205,7 @@ class BitVector {
       }
       left += 4;
     }
-    while (left < super_block_rank.size() && super_block_rank[left] < rank0) {
+    while (left < super_block_rank.size() && kSuperBlockSize * left - super_block_rank[left] < rank0) {
       left++;
     }
     return left - 1;
@@ -354,7 +354,7 @@ class BitVector {
    * rank_1(pos) = number of 1s in positions [0...pos-1]
    */
   uint64_t rank(size_t pos) const {
-    if (pos >= num_bits_) [[unlikely]] {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
       return max_rank;
     }
     uint64_t b_block = pos / kBasicBlockSize;
@@ -371,7 +371,12 @@ class BitVector {
    * Rank zero operation: count the number of 0-bits up to position pos
    * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1]
    */
-  uint64_t rank0(size_t pos) const { return pos - rank(pos); }
+  uint64_t rank0(size_t pos) const {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
+      return bits.size() * kWordSize - max_rank;
+    }
+    return pos - rank(pos);
+  }
 
   /**
    * Select operation: find the position of the i-th occurrence of a 1-bit

From 01819e1c6ea403d3cc9114627a3abaa4001b503c Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Fri, 19 Dec 2025 15:23:06 +0300
Subject: [PATCH 7/9] select0/rank0 benchmarks

---
 src/benchmark_tests.cpp |  67 +++++++++++++++++-
 src/benchmarks.cpp      | 152 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 215 insertions(+), 4 deletions(-)

diff --git a/src/benchmark_tests.cpp b/src/benchmark_tests.cpp
index 1233a3c..2c302f4 100644
--- a/src/benchmark_tests.cpp
+++ b/src/benchmark_tests.cpp
@@ -10,7 +10,7 @@ using pixie::BitVector;
 using pixie::BitVectorInterleaved;
 
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -30,8 +30,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved10PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.1;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -51,8 +72,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved90PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.9;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -69,3 +111,22 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
     }
   }
 }
+
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    for (auto& x : bits) {
+      x = rng();
+    }
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp
index e2586fa..92bafe2 100644
--- a/src/benchmarks.cpp
+++ b/src/benchmarks.cpp
@@ -24,6 +24,22 @@ static void BM_RankNonInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_RankInterleaved(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -58,6 +74,24 @@ static void BM_SelectNonInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -77,6 +111,25 @@ static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -98,6 +151,27 @@ static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -117,6 +191,25 @@ static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -138,13 +231,40 @@ static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select(rank0));
+  }
+}
+
+BENCHMARK(BM_RankInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
-BENCHMARK(BM_RankInterleaved)
+BENCHMARK(BM_RankZeroNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
@@ -156,26 +276,56 @@ BENCHMARK(BM_SelectNonInterleaved)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
+
+BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
\ No newline at end of file

From 5182b450cb293ee7d00ef2e72cda73368df27697 Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Fri, 19 Dec 2025 15:25:15 +0300
Subject: [PATCH 8/9] Format fix

---
 include/bitvector.h | 3 ++-
 src/benchmarks.cpp  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/bitvector.h b/include/bitvector.h
index 6f2e959..60a8f7e 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -205,7 +205,8 @@ class BitVector {
       }
       left += 4;
     }
-    while (left < super_block_rank.size() && kSuperBlockSize * left - super_block_rank[left] < rank0) {
+    while (left < super_block_rank.size() &&
+           kSuperBlockSize * left - super_block_rank[left] < rank0) {
       left++;
     }
     return left - 1;
diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp
index 92bafe2..54a0ea6 100644
--- a/src/benchmarks.cpp
+++ b/src/benchmarks.cpp
@@ -328,4 +328,4 @@ BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
-    ->Iterations(100000);
\ No newline at end of file
+    ->Iterations(100000);

From 8ae31fbf91dd7db41f3ed7c81501475d1b22054c Mon Sep 17 00:00:00 2001
From: mperikov <mperikov2@mail.ru>
Date: Fri, 19 Dec 2025 22:03:45 +0300
Subject: [PATCH 9/9] include fix

---
 include/bitvector.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/bitvector.h b/include/bitvector.h
index 60a8f7e..43e5bc9 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -7,8 +7,6 @@
 #include <string>
 #include <vector>
 
-// #include <iostream>
-
 #include "bits.h"
 
 namespace pixie {