diff --git a/include/bits.h b/include/bits.h
index c7ee26d..dd87f7f 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -128,8 +128,8 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   __m512i res = _mm512_loadu_epi64(x);
-  std::array<uint64_t, 8> counts;
-  _mm512_storeu_epi64(counts.data(), _mm512_popcnt_epi64(res));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
 
   size_t i = 0;
   while (i < 8 && counts[i] <= rank) {
@@ -150,6 +150,37 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) {
 #endif
 }
 
+/**
+ * @brief Return the position of the @p rank0 -th 0 bit in @p x
+ * @details select_512 with bit inversion.
+ */
+uint64_t select0_512(const uint64_t* x, uint64_t rank0) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  __m512i res = _mm512_loadu_epi64(x);
+  res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1));
+  alignas(64) std::array<uint64_t, 8> counts;
+  _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res));
+
+  size_t i = 0;
+  while (i < 8 && counts[i] <= rank0) {
+    rank0 -= counts[i++];
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#else
+
+  size_t i = 0;
+  int popcount = std::popcount(~x[0]);
+  while (i < 7 && popcount <= rank0) {
+    rank0 -= popcount;
+    popcount = std::popcount(~x[++i]);
+  }
+  return i * 64 + select_64(~x[i], rank0);
+
+#endif
+}
+
 /**
  * @brief Compare 4 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -190,6 +221,61 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 4 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the length of the prefix
+ * where ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_4x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m256i dlt_256 = _mm256_loadu_epi64(dlt_array);
+  auto x_256 = _mm256_loadu_epi64(x);
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+  auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  const __m256i dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto dlt_4 = _mm256_set1_epi64x(dlt_scalar);
+  auto y_4 = _mm256_set1_epi64x(y);
+
+  auto tmp = _mm256_add_epi64(dlt_4, dlt_256);
+  auto reg_256 = _mm256_sub_epi64(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_4, offset);
+  auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64(
+      x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1))));
+
+  return _tzcnt_u32(mask) >> 3;
+
+#else
+
+  for (uint16_t i = 0; i < 4; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
+      return i;
+    }
+  }
+  return 4;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 8 64-bit numbers of @p x with @p y and
  * return the length of the prefix where @p y is less then @p x
@@ -227,11 +313,57 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 8 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the length of the prefix
+ * where ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_8x64(const uint64_t* x,
+                              uint64_t y,
+                              const uint64_t* dlt_array,
+                              uint64_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto dlt_8 = _mm512_set1_epi64(dlt_scalar);
+  auto y_8 = _mm512_set1_epi64(y);
+
+  auto tmp = _mm512_add_epi64(dlt_8, dlt_512);
+  auto reg_512 = _mm512_sub_epi64(tmp, x_512);
+  auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8);
+
+  return _tzcnt_u16(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  uint16_t len = lower_bound_dlt_4x64(x, y, dlt_array, dlt_scalar);
+
+  if (len < 4) {
+    return len;
+  }
+
+  return len + lower_bound_dlt_4x64(x + 4, y, dlt_array + 4, dlt_scalar);
+
+#else
+
+  for (uint16_t i = 0; i < 8; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] >= y) {
+      return i;
+    }
+  }
+  return 8;
+
+#endif
+#endif
+}
+
 /**
  * @brief Compare 32 16-bit numbers of @p x with @p y and
  * return the count of numbers where @p x is less then @p y
  */
-uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
+uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) {
 #ifdef PIXIE_AVX512_SUPPORT
 
   auto y_32 = _mm512_set1_epi16(y);
@@ -273,6 +405,72 @@ uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) {
 #endif
 }
 
+/**
+ * @brief Compare 32 16-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x )
+ * with @p y and return the count of numbers where
+ * ( @p dlt_array + @p dlt_scalar - @p x ) is less than @p y
+ */
+uint16_t lower_bound_dlt_32x16(const uint16_t* x,
+                               uint16_t y,
+                               const uint16_t* dlt_array,
+                               uint16_t dlt_scalar) {
+#ifdef PIXIE_AVX512_SUPPORT
+
+  const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array);
+  auto x_512 = _mm512_loadu_epi64(x);
+  auto dlt_32 = _mm512_set1_epi16(dlt_scalar);
+  auto y_32 = _mm512_set1_epi16(y);
+
+  auto tmp = _mm512_add_epi16(dlt_32, dlt_512);
+  auto reg_512 = _mm512_sub_epi16(tmp, x_512);
+  auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32);
+  return std::popcount(cmp);
+
+#else
+#ifdef PIXIE_AVX2_SUPPORT
+
+  auto dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array));
+  auto x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x));
+  auto dlt_16 = _mm256_set1_epi16(dlt_scalar);
+  auto y_16 = _mm256_set1_epi16(y);
+
+  auto tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  auto reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  const __m256i offset = _mm256_set1_epi16(0x8000);
+  __m256i x_offset = _mm256_xor_si256(reg_256, offset);
+  __m256i y_offset = _mm256_xor_si256(y_16, offset);
+  uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  uint16_t count = std::popcount(mask) >> 1;
+
+  dlt_256 =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(dlt_array + 16));
+  x_256 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + 16));
+
+  tmp = _mm256_add_epi16(dlt_16, dlt_256);
+  reg_256 = _mm256_sub_epi16(tmp, x_256);
+
+  x_offset = _mm256_xor_si256(reg_256, offset);
+  mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset));
+
+  return count + (std::popcount(mask) >> 1);
+
+#else
+
+  uint16_t cnt = 0;
+  for (uint16_t i = 0; i < 32; ++i) {
+    if (dlt_array[i] + dlt_scalar - x[i] < y) {
+      cnt++;
+    }
+  }
+  return cnt;
+
+#endif
+#endif
+}
+
 /**
  * @brief Calculates 64 popcounts of 4-bits integers and stores as 64 4-bits
  * integers (packed into 32 bytes)
diff --git a/include/bitvector.h b/include/bitvector.h
index 117e00a..43e5bc9 100644
--- a/include/bitvector.h
+++ b/include/bitvector.h
@@ -72,9 +72,13 @@ class BitVector {
   constexpr static size_t kSelectSampleFrequency = 16384;
   constexpr static size_t kBlocksPerSuperBlock = 128;
 
+  alignas(64) uint64_t dlt_super[8];
+  alignas(64) uint16_t dlt_basic[32];
+
   std::vector<uint64_t> super_block_rank;
   std::vector<uint16_t> basic_block_rank;
   std::vector<uint64_t> select_samples;
+  std::vector<uint64_t> select0_samples;
   const size_t num_bits_;
   const size_t padded_size_;
   size_t max_rank;
@@ -119,10 +123,14 @@ class BitVector {
    */
   void build_select() {
     uint64_t milestone = kSelectSampleFrequency;
+    uint64_t milestone0 = kSelectSampleFrequency;
     uint64_t rank = 0;
+    uint64_t rank0 = 0;
     select_samples.emplace_back(0);
+    select0_samples.emplace_back(0);
     for (size_t i = 0; i < bits.size(); ++i) {
       auto ones = std::popcount(bits[i]);
+      auto zeros = 64 - ones;
       if (rank + ones >= milestone) {
         auto pos = select_64(bits[i], milestone - rank - 1);
         // TODO: try including global rank into select samples to save
@@ -130,7 +138,20 @@ class BitVector {
         select_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
         milestone += kSelectSampleFrequency;
       }
+      if (rank0 + zeros >= milestone0) {
+        auto pos = select_64(~bits[i], milestone0 - rank0 - 1);
+        select0_samples.emplace_back((64 * i + pos) / kSuperBlockSize);
+        milestone0 += kSelectSampleFrequency;
+      }
       rank += ones;
+      rank0 += zeros;
+    }
+
+    for (size_t i = 0; i < 8; ++i) {
+      dlt_super[i] = i * kSuperBlockSize;
+    }
+    for (size_t i = 0; i < 32; ++i) {
+      dlt_basic[i] = i * kBasicBlockSize;
     }
   }
 
@@ -160,6 +181,35 @@ class BitVector {
     return left - 1;
   }
 
+  /**
+   * @brief First step of the select0 operation
+   */
+  uint64_t find_superblock_zeros(uint64_t rank0) const {
+    uint64_t left = select0_samples[rank0 / kSelectSampleFrequency];
+
+    while (left + 7 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 8) {
+        return left + len - 1;
+      }
+      left += 8;
+    }
+    if (left + 3 < super_block_rank.size()) {
+      auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, dlt_super,
+                                      kSuperBlockSize * left);
+      if (len < 4) {
+        return left + len - 1;
+      }
+      left += 4;
+    }
+    while (left < super_block_rank.size() &&
+           kSuperBlockSize * left - super_block_rank[left] < rank0) {
+      left++;
+    }
+    return left - 1;
+  }
+
   /**
    * @brief SIMD-optimized linear scan
   * @details
   * Processes 32 16-bit entries at once (full cache line), so there is at most
   * 4 iterations.
   */
  uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const {
-    auto pos = 0;
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+    }
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
+  }
 
-    for (size_t i = 0; i < 4; ++i) {
-      auto count = lower_bound_32x16(&basic_block_rank[128 * s_block + 32 * i],
-                                     local_rank);
+  /**
+   * @brief SIMD-optimized linear scan
+   * @details
+   * Processes 32 16-bit entries at once (full cache line), so there are at
+   * most 4 iterations.
+   */
+  uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const {
+    for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          dlt_basic, kBasicBlockSize * pos);
       if (count < 32) {
         return kBlocksPerSuperBlock * s_block + pos + count - 1;
       }
-      pos += 32;
     }
-    return kBlocksPerSuperBlock * s_block + pos - 1;
+    return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1;
   }
 
   /**
@@ -197,8 +262,8 @@ class BitVector {
     pos = pos + 16 < 32 ? 0 : (pos - 16);
     pos = pos > 96 ? 96 : pos;
     while (pos < 96) {
-      auto count =
-          lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+      auto count = lower_bound_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
       if (count == 0) {
         return find_basicblock(local_rank, s_block);
       }
@@ -208,14 +273,54 @@ class BitVector {
       pos += 32;
     }
     pos = 96;
-    auto count =
-        lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank);
+    auto count = lower_bound_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank);
     if (count == 0) {
       return find_basicblock(local_rank, s_block);
     }
     return kBlocksPerSuperBlock * s_block + pos + count - 1;
   }
 
+  /**
+   * @brief Interpolation search with SIMD optimization
+   * @details
+   * Similar to find_basicblock_zeros, but the initial guess is based on
+   * linear interpolation; for random data this guess should be correct most
+   * of the time. We start from the 32-wide block with the interpolation
+   * guess at its center; if the select result lies in lower blocks, we back
+   * off to find_basicblock_zeros.
+   */
+  uint64_t find_basicblock_is_zeros(uint16_t local_rank0,
+                                    uint64_t s_block) const {
+    auto lower = kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto upper =
+        kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1];
+
+    uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower);
+    pos = pos + 16 < 32 ? 0 : (pos - 16);
+    pos = pos > 96 ? 96 : pos;
+    while (pos < 96) {
+      auto count = lower_bound_dlt_32x16(
+          &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+          dlt_basic, kBasicBlockSize * pos);
+      if (count == 0) {
+        return find_basicblock_zeros(local_rank0, s_block);
+      }
+      if (count < 32) {
+        return kBlocksPerSuperBlock * s_block + pos + count - 1;
+      }
+      pos += 32;
+    }
+    pos = 96;
+    auto count = lower_bound_dlt_32x16(
+        &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0,
+        dlt_basic, kBasicBlockSize * pos);
+    if (count == 0) {
+      return find_basicblock_zeros(local_rank0, s_block);
+    }
+    return kBlocksPerSuperBlock * s_block + pos + count - 1;
+  }
+
  public:
   /**
    * Constructor from an external array of uint64_t.
@@ -248,6 +353,9 @@ class BitVector {
    * rank_1(pos) = number of 1s in positions [0...pos-1]
    */
   uint64_t rank(size_t pos) const {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
+      return max_rank;
+    }
     uint64_t b_block = pos / kBasicBlockSize;
     uint64_t s_block = pos / kSuperBlockSize;
     // Precomputed rank
@@ -258,6 +366,21 @@ class BitVector {
     return result;
   }
 
+  /**
+   * Rank zero operation: count the number of 0-bits up to position pos
+   * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1]
+   */
+  uint64_t rank0(size_t pos) const {
+    if (pos >= bits.size() * kWordSize) [[unlikely]] {
+      return bits.size() * kWordSize - max_rank;
+    }
+    return pos - rank(pos);
+  }
+
+  /**
+   * Select operation: find the position of the rank-th occurrence of a 1-bit
+   * select_1(rank) = index of the rank-th occurrence of 1-bit
+   */
   uint64_t select(size_t rank) const {
     if (rank > max_rank) [[unlikely]] {
       return num_bits_;
@@ -283,6 +406,36 @@ class BitVector {
     return kWordSize * pos + select_512(&bits[pos], rank - 1);
   }
 
+  /**
+   * Select zero operation: find the position of the rank0-th occurrence of a
+   * 0-bit select_0(rank0) = index of the rank0-th occurrence of 0-bit
+   */
+  uint64_t select0(size_t rank0) const {
+    if (rank0 > num_bits_ - max_rank) [[unlikely]] {
+      return num_bits_;
+    }
+    if (rank0 == 0) [[unlikely]] {
+      return 0;
+    }
+    uint64_t s_block = find_superblock_zeros(rank0);
+    rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block];
+    auto pos = find_basicblock_is_zeros(rank0, s_block);
+    auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1);
+    rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos];
+    pos *= kWordsPerBlock;
+
+    // Final search
+    if (pos + kWordsPerBlock - 1 >= bits.size()) [[unlikely]] {
+      size_t zeros = std::popcount(~bits[pos]);
+      while (pos < bits.size() && zeros < rank0) {
+        rank0 -= zeros;
+        zeros = std::popcount(~bits[++pos]);
+      }
+      return kWordSize * pos + select_64(~bits[pos], rank0 - 1);
+    }
+    return kWordSize * pos + select0_512(&bits[pos], rank0 - 1);
+  }
+
   /**
    * Convert the bit vector to a binary string, for debug purpose only
    */
diff --git a/src/benchmark_tests.cpp b/src/benchmark_tests.cpp
index 1233a3c..2c302f4 100644
--- a/src/benchmark_tests.cpp
+++ b/src/benchmark_tests.cpp
@@ -10,7 +10,7 @@ using pixie::BitVector;
 using pixie::BitVectorInterleaved;
 
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -30,8 +30,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved10PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.1;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -51,8 +72,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) {
   }
 }
 
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved90PercentFill) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    size_t num_ones = n * 0.9;
+    for (int i = 0; i < num_ones; i++) {
+      uint64_t pos = rng() % n;
+      bits[pos / 64] |= (1ULL << pos % 64);
+    }
+
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
+
 TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
-  for (size_t n = 4; n <= (1ull << 34); n <<= 2) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
     std::mt19937_64 rng(42);
 
     std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
@@ -69,3 +111,22 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved) {
     }
   }
 }
+
+TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved) {
+  for (size_t n = 8; n <= (1ull << 34); n <<= 2) {
+    std::mt19937_64 rng(42);
+
+    std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+    for (auto& x : bits) {
+      x = rng();
+    }
+    pixie::BitVector bv(bits, n);
+
+    auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+    for (int i = 0; i < 100000; i++) {
+      uint64_t rank0 = rng() % max_rank0;
+      bv.select0(rank0);
+    }
+  }
+}
diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp
index e2586fa..54a0ea6 100644
--- a/src/benchmarks.cpp
+++ b/src/benchmarks.cpp
@@ -24,6 +24,22 @@ static void BM_RankNonInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_RankInterleaved(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -58,6 +74,24 @@ static void BM_RankInterleaved(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  for (auto& x : bits) {
+    x = rng();
+  }
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -77,6 +111,25 @@ static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -98,6 +151,27 @@ static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.1;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
 static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -117,6 +191,25 @@ static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  for (auto _ : state) {
+    uint64_t pos = rng() % n;
+    benchmark::DoNotOptimize(bv.rank0(pos));
+  }
+}
+
 static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   size_t n = state.range(0);
   std::mt19937_64 rng(42);
@@ -138,13 +231,40 @@ static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) {
   }
 }
 
+static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) {
+  size_t n = state.range(0);
+  std::mt19937_64 rng(42);
+
+  std::vector<uint64_t> bits(((8 + n / 64) / 8) * 8);
+  size_t num_ones = n * 0.9;
+  for (int i = 0; i < num_ones; i++) {
+    uint64_t pos = rng() % n;
+    bits[pos / 64] |= (1ULL << pos % 64);
+  }
+
+  pixie::BitVector bv(bits, n);
+
+  auto max_rank0 = bv.rank0(bv.size()) + 1;
+
+  for (auto _ : state) {
+    uint64_t rank0 = rng() % max_rank0;
+    benchmark::DoNotOptimize(bv.select0(rank0));
+  }
+}
+
+BENCHMARK(BM_RankInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
-BENCHMARK(BM_RankInterleaved)
+BENCHMARK(BM_RankZeroNonInterleaved)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
@@ -156,26 +276,56 @@ BENCHMARK(BM_SelectNonInterleaved)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved10PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_RankNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
    ->Range(8, 1ull << 34)
     ->Iterations(100000);
 
+BENCHMARK(BM_RankZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
+
 BENCHMARK(BM_SelectNonInterleaved90PercentFill)
     ->ArgNames({"n"})
     ->RangeMultiplier(4)
     ->Range(8, 1ull << 34)
     ->Iterations(100000);
+
+BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill)
+    ->ArgNames({"n"})
+    ->RangeMultiplier(4)
+    ->Range(8, 1ull << 34)
+    ->Iterations(100000);
diff --git a/src/unittests.cpp b/src/unittests.cpp
index dddb0b7..27d037c 100644
--- a/src/unittests.cpp
+++ b/src/unittests.cpp
@@ -89,6 +89,14 @@ TEST(Select512, Ones) {
   }
 }
 
+TEST(SelectZero512, Zeros) {
+  std::array<uint64_t, 8> a{0, 0, 0, 0, 0, 0, 0, 0};
+  for (size_t i = 0; i < 512; ++i) {
+    auto p = select0_512(a.data(), i);
+    EXPECT_EQ(p, i);
+  }
+}
+
 TEST(Select512, Random) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -114,6 +122,31 @@ TEST(Select512, Random) {
   }
 }
 
+TEST(SelectZero512, Random) {
+  std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max()};
+
+  std::mt19937_64 rng(42);
+  for (size_t t = 0; t < 1000; ++t) {
+    for (auto& x : a) {
+      x = rng();
+    }
+    size_t rank = 0;
+    for (size_t i = 0; i < 512; ++i) {
+      if ((1 & (a[i >> 6] >> (i & 63))) == 0) {
+        auto p = select0_512(a.data(), rank++);
+        ASSERT_EQ(p, i);
+      }
+    }
+  }
+}
+
 TEST(Select512, RankCompativility) {
   std::array<uint64_t, 8> a{std::numeric_limits<uint64_t>::max(),
                             std::numeric_limits<uint64_t>::max(),
@@ -209,7 +242,7 @@ TEST(BitVectorTest, RankWithZeros) {
 
 TEST(BitVectorTest, SelectBasic) {
   std::vector<uint64_t> bits(8, 0);
   bits[0] = 0b1100010110010110;
-  BitVector bv(bits, 5);
+  BitVector bv(bits, 16);
 
   EXPECT_EQ(bv.select(1), 1);
   EXPECT_EQ(bv.select(2), 2);
@@ -255,6 +288,78 @@ TEST(BitVectorTest, MainSelectTest) {
   }
 }
 
+TEST(BitVectorTest, RankZeroBasic) {
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b10110;
+  BitVector bv(bits, 10);
+
+  EXPECT_EQ(bv.rank0(0), 0);
+  EXPECT_EQ(bv.rank0(1), 1);
+  EXPECT_EQ(bv.rank0(2), 1);
+  EXPECT_EQ(bv.rank0(3), 1);
+  EXPECT_EQ(bv.rank0(4), 2);
+  EXPECT_EQ(bv.rank0(5), 2);
+}
+
+TEST(BitVectorTest, RankZeroWithOnes) {
+  std::vector<uint64_t> bits(8, 0);
+  BitVector bv(bits, 5);
+
+  for (size_t i = 0; i <= 5; i++) {
+    EXPECT_EQ(bv.rank0(i), i);
+  }
+}
+
+TEST(BitVectorTest, SelectZeroBasic) {
+  std::vector<uint64_t> bits(8, 0);
+  bits[0] = 0b1100010110010110;
+  BitVector bv(bits, 16);
+
+  EXPECT_EQ(bv.select0(1), 0);
+  EXPECT_EQ(bv.select0(2), 3);
+  EXPECT_EQ(bv.select0(3), 5);
+  EXPECT_EQ(bv.select0(4), 6);
+  EXPECT_EQ(bv.select0(5), 9);
+  EXPECT_EQ(bv.select0(6), 11);
+  EXPECT_EQ(bv.select0(7), 12);
+  EXPECT_EQ(bv.select0(8), 13);
+}
+
+TEST(BitVectorTest, MainRankZeroTest) {
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
+
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_count = 0;
+  for (size_t i = 0; i < bv.size(); ++i) {
+    ASSERT_EQ(zero_count, bv.rank0(i));
+    zero_count += (bv[i] == 0) ? 1 : 0;
+  }
+}
+
+TEST(BitVectorTest, MainSelectZeroTest) {
+  std::mt19937_64 rng(42);
+  std::vector<uint64_t> bits(65536 * 32);
+  for (size_t i = 0; i < bits.size(); i++) {
+    bits[i] = rng();
+  }
+
+  BitVector bv(bits, bits.size() * 64);
+  uint64_t zero_rank = 0;
+
+  for (size_t i = 0; i < bv.size(); ++i) {
+    if (bv[i] == 0) {
+      zero_rank++;
+      EXPECT_EQ(bv.select0(zero_rank), i);
+      EXPECT_EQ(bv.rank0(i), zero_rank - 1);
+      EXPECT_EQ(bv.rank0(i + 1), zero_rank);
+    }
+  }
+}
+
 TEST(BitVectorInterleavedTest, AtTest) {
   std::mt19937_64 rng(42);
   /*
@@ -330,6 +435,52 @@ TEST(LowerBound8x64, Random) {
   }
 }
 
+TEST(LowerBoundDlt4x64, Random) {
+  std::vector<uint64_t> x(4);
+  uint64_t dlt_array[4];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    bool fl = true;
+    for (size_t j = 0; j < 4; j++) {
+      dlt_array[j] = rng();
+      x[j] = rng();
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 4) {
+      ASSERT_EQ(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    } else {
+      ASSERT_GE(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    }
+  }
+}
+
+TEST(LowerBoundDlt8x64, Random) {
+  std::vector<uint64_t> x(8);
+  uint64_t dlt_array[8];
+  std::mt19937_64 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint64_t y = rng();
+    uint64_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    bool fl = true;
+    for (size_t j = 0; j < 8; j++) {
+      dlt_array[j] = rng();
+      x[j] = rng();
+      fl &= dlt_scalar + dlt_array[j] - x[j] < y;
+      cnt += fl;
+    }
+    if (cnt < 8) {
+      ASSERT_EQ(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    } else {
+      ASSERT_GE(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt);
+    }
+  }
+}
+
 TEST(LowerBound32x16, Random) {
   std::vector<uint16_t> x(32);
   std::mt19937 rng(42);
@@ -344,6 +495,28 @@ TEST(LowerBound32x16, Random) {
   }
 }
 
+TEST(LowerBoundDlt32x16, Random) {
+  std::vector<uint16_t> x(32);
+  uint16_t dlt_array[32];
+  std::mt19937 rng(42);
+  for (size_t i = 0; i < 100000; i++) {
+    uint16_t y = rng();
+    uint16_t dlt_scalar = rng();
+    uint16_t cnt = 0;
+    for (size_t j = 0; j < 32; j++) {
+      x[j] = rng();
+      if (dlt_scalar < x[j]) {
+        dlt_array[j] =
+            x[j] - dlt_scalar + rng() % ((1 << 16) - x[j] + dlt_scalar);
+      } else {
+        dlt_array[j] = rng() % ((1 << 16) - dlt_scalar);
+      }
+      cnt += dlt_array[j] + dlt_scalar - x[j] < y;
+    }
+    ASSERT_EQ(lower_bound_dlt_32x16(x.data(), y, dlt_array, dlt_scalar), cnt);
+  }
+}
+
 TEST(Popcount64x4, Random) {
   std::vector x(32);
   std::vector y(32);