From 5c470274f46173aed7818b35c7ed3beda59ec0b0 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 14:54:49 +0200 Subject: [PATCH 01/23] added `into_bit_vec` to RsVec (#31) --- src/bit_vec/fast_rs_vec/mod.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 9014988..9b3926e 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -383,6 +383,18 @@ impl RsVec { } } + /// Convert the `RsVec` into a [`BitVec`]. + /// This consumes the `RsVec`, and discards all meta-data. + /// Since [`RsVec`]s are innately immutable, this conversion is the only way to modify the + /// underlying data. + #[must_use] + pub fn into_bit_vec(self) -> BitVec { + BitVec { + data: self.data, + len: self.len, + } + } + /// Check if two `RsVec`s are equal. For sparse vectors (either sparsely filled with 1-bits or /// 0-bits), this is faster than comparing the vectors bit by bit. /// Choose the value of `ZERO` depending on which bits are more sparse. From 4078e88ddf495d9c57d0fb1017f56aa10a56f97b Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 14:59:12 +0200 Subject: [PATCH 02/23] add example to into_bit_vec --- src/bit_vec/fast_rs_vec/mod.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 9b3926e..4d4b053 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -387,6 +387,23 @@ impl RsVec { /// This consumes the `RsVec`, and discards all meta-data. /// Since [`RsVec`]s are innately immutable, this conversion is the only way to modify the /// underlying data. 
+ /// + /// # Example + /// ```rust + /// use vers_vecs::{BitVec, RsVec}; + /// + /// let mut bit_vec = BitVec::new(); + /// bit_vec.append_word(u64::MAX); + /// + /// let rs_vec = RsVec::from_bit_vec(bit_vec); + /// assert_eq!(rs_vec.rank1(64), 64); + /// + /// let mut bit_vec = rs_vec.into_bit_vec(); + /// bit_vec.flip_bit(32); + /// let rs_vec = RsVec::from_bit_vec(bit_vec); + /// assert_eq!(rs_vec.rank1(64), 63); + /// assert_eq!(rs_vec.select0(0), 32); + /// ``` #[must_use] pub fn into_bit_vec(self) -> BitVec { BitVec { From a075a5b5ab4727b04ac7ca2aab15be3282934ee9 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 16:04:54 +0200 Subject: [PATCH 03/23] add `into_parentheses_vec` to `BpTree` (#31) --- src/trees/bp/mod.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 1262e90..78cb215 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -506,6 +506,35 @@ impl BpTree { ChildrenIter::::new(self, node) } + /// Transform the tree into a [`RsVec`] containing the balanced parenthesis expression. + /// This consumes the tree and returns the underlying bit vector with the rank and select + /// support structure. + /// The remaining min-max-tree support structure of the `BpTree` is discarded. + /// Since the tree is innately immutable, this is the only way to access the underlying bit + /// vector for potential modification. + /// Modification requires turning the `RsVec` back into a `BitVec`, discarding the rank and select + /// support structure, however. 
+ /// + /// # Examples + /// ```rust + /// use vers_vecs::{BitVec, RsVec, BpTree, Tree}; + /// + /// let bv = BitVec::pack_sequence_u8(&[0b1101_0111, 0b0010_0100], 8); + /// let tree = BpTree::<4>::from_bit_vector(bv); + /// assert_eq!(tree.size(), 8); + /// + /// let rs_vec = tree.into_parentheses_vec(); + /// let mut bv = rs_vec.into_bit_vec(); + /// + /// bv.flip_bit(15); + /// bv.append_bits(0, 2); + /// let tree = BpTree::<4>::from_bit_vector(bv); + /// assert_eq!(tree.size(), 9); + /// ``` + pub fn into_parentheses_vec(self) -> RsVec { + self.vec + } + /// Returns the number of bytes used on the heap for this tree. This does not include /// allocated space that is not used (e.g. by the allocation behavior of `Vec`). #[must_use] From 0b391469c0192d8070444b7bb9800dcc26c89477 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 19:19:08 +0200 Subject: [PATCH 04/23] add `must_use` to `into_parentheses_vec` --- src/trees/bp/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 78cb215..0f6313f 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -531,6 +531,7 @@ impl BpTree { /// let tree = BpTree::<4>::from_bit_vector(bv); /// assert_eq!(tree.size(), 9); /// ``` + #[must_use] pub fn into_parentheses_vec(self) -> RsVec { self.vec } From 84c6af998a57cf06694c6bed9c53d199b64740b5 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 20:14:02 +0200 Subject: [PATCH 05/23] reduced branching in get_bits, improving performance of various data structures --- src/bit_vec/fast_rs_vec/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 4d4b053..8961d87 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -373,10 +373,8 @@ impl RsVec { pub fn get_bits_unchecked(&self, pos: usize, len: usize) -> u64 { debug_assert!(len <= WORD_SIZE); let partial_word = 
self.data[pos / WORD_SIZE] >> (pos % WORD_SIZE); - if pos % WORD_SIZE + len == WORD_SIZE { - partial_word - } else if pos % WORD_SIZE + len < WORD_SIZE { - partial_word & ((1 << (len % WORD_SIZE)) - 1) + if pos % WORD_SIZE + len <= WORD_SIZE { + partial_word & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) } else { (partial_word | (self.data[pos / WORD_SIZE + 1] << (WORD_SIZE - pos % WORD_SIZE))) & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) From 6238ba555c2f44cf6cb8ec5f82c5ba72ddb5dbd4 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Sun, 11 May 2025 21:05:03 +0200 Subject: [PATCH 06/23] same branching reduction in BitVec::get_bits_unchecked --- src/bit_vec/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index ef4eea5..8e6b5ef 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -948,10 +948,8 @@ impl BitVec { pub fn get_bits_unchecked(&self, pos: usize, len: usize) -> u64 { debug_assert!(len <= WORD_SIZE); let partial_word = self.data[pos / WORD_SIZE] >> (pos % WORD_SIZE); - if pos % WORD_SIZE + len == WORD_SIZE { - partial_word - } else if pos % WORD_SIZE + len < WORD_SIZE { - partial_word & ((1 << (len % WORD_SIZE)) - 1) + if pos % WORD_SIZE + len <= WORD_SIZE { + partial_word & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) } else { (partial_word | (self.data[pos / WORD_SIZE + 1] << (WORD_SIZE - pos % WORD_SIZE))) & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) From 7c444def1f47a406306e027146c2589004350fcd Mon Sep 17 00:00:00 2001 From: Arne Skjaerholt Date: Mon, 12 May 2025 13:19:30 +0200 Subject: [PATCH 07/23] Quality-of-life API improvements. 
Adds: - `From for BitVec` implementations for converting RsVec, BpTree and WaveletMatrix back into BitVecs - `impl Extend for BitVec` and `BitVec.extend_bitvec()` to append many bits at once - `BitVec.split_at()` and `.split_at_unchecked()` to split a BitVec into two halves --- src/bit_vec/fast_rs_vec/mod.rs | 9 ++++ src/bit_vec/mod.rs | 79 ++++++++++++++++++++++++++++++++++ src/trees/bp/mod.rs | 6 +++ src/wavelet/mod.rs | 8 ++++ 4 files changed, 102 insertions(+) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 8961d87..100ec64 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -542,6 +542,15 @@ impl From for RsVec { } } +impl From for BitVec { + fn from(value: RsVec) -> Self { + BitVec { + data: value.data, + len: value.len, + } + } +} + // iter code in here to keep it more organized mod iter; // select code in here to keep it more organized diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 8e6b5ef..fca706b 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -736,6 +736,19 @@ impl BitVec { self.len += len; } + /// Append the bits of another bit vector to the vector. + pub fn extend_bitvec(&mut self, other: &Self) { + let full_limbs = other.len() / WORD_SIZE; + for i in 0..full_limbs { + self.append_bits(other.data[i], WORD_SIZE); + } + + let partial_bits = other.len % WORD_SIZE; + if partial_bits > 0 { + self.append_bits(other.data[full_limbs], partial_bits); + } + } + /// Return the length of the bit vector. The length is measured in bits. #[must_use] pub fn len(&self) -> usize { @@ -1192,6 +1205,56 @@ impl BitVec { pub fn heap_size(&self) -> usize { self.data.len() * size_of::() } + + /// Split the vector in two at the specified index. The left half contains bits `0..at` and the + /// right half the remaining bits `at..`. If the split index is larger than the length of the + /// vector, the vector is returned unmodified in an `Err` variant. 
+ /// + /// See also: [`split_at_unchecked`] + pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> { + if at >= self.len { + Err(self) + } else { + Ok(self.split_at_unchecked(at)) + } + } + + /// Split the vector in two at the specified index. The left half contains bits `0..at` and the + /// right half the remaining bits `at..`. + /// + /// # Panics + /// If the index is larger than the length of the vector the function will panic or return + /// unpredictable data. Use [`split_at`] to properly handle this case. + #[must_use] + pub fn split_at_unchecked(self, at: usize) -> (Self, Self) { + let other_len = self.len - at; + let mut other = Self::with_capacity(other_len); + + let first_limb = at / WORD_SIZE; + let full_limbs = self.len / WORD_SIZE; + + let leading_partial = at % WORD_SIZE; + let iter_limbs = if leading_partial > 0 { + other.append_bits_unchecked( + self.data[first_limb] >> (WORD_SIZE - leading_partial), + leading_partial, + ); + first_limb + 1..full_limbs + } else { + first_limb..full_limbs + }; + + for i in iter_limbs { + other.append_bits_unchecked(self.data[i], WORD_SIZE); + } + + let trailing_partial = self.len % WORD_SIZE; + if trailing_partial > 0 { + other.append_bits_unchecked(self.data[full_limbs], trailing_partial); + } + + (self, other) + } } impl_vector_iterator! { BitVec, BitVecIter, BitVecRefIter } @@ -1216,6 +1279,22 @@ impl From> for BitVec { } } +impl Extend for BitVec { + fn extend>(&mut self, iter: T) { + for v in iter { + self.extend_bitvec(&v) + } + } +} + +impl<'t> Extend<&'t BitVec> for BitVec { + fn extend>(&mut self, iter: T) { + for v in iter { + self.extend_bitvec(v) + } + } +} + /// Create a new bit vector from u64 values. /// The bits are appended in little-endian order (i.e. the least significant bit is appended first). 
/// The function will append the bits of each element to the bit vector in the order they are diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 0f6313f..1dc7eb9 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -749,6 +749,12 @@ impl From for BpTree { } } +impl From> for BitVec { + fn from(value: BpTree) -> Self { + value.vec.into() + } +} + /// An iterator over the children of a node. /// Calls to `next` return the next child node handle in the order they appear in the parenthesis /// expression. diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs index 3d08602..b463535 100644 --- a/src/wavelet/mod.rs +++ b/src/wavelet/mod.rs @@ -2053,6 +2053,14 @@ impl WaveletMatrix { } } +impl From for BitVec { + fn from(value: WaveletMatrix) -> Self { + let mut output = BitVec::new(); + output.extend(value); + output + } +} + impl_vector_iterator!( WaveletMatrix, WaveletIter, From 93d45e4bb39e31252bbc0ae0490bc6311463b559 Mon Sep 17 00:00:00 2001 From: Arne Skjaerholt Date: Mon, 12 May 2025 15:37:38 +0200 Subject: [PATCH 08/23] Change From implementations to use methods on their types. 
--- src/bit_vec/fast_rs_vec/mod.rs | 5 +---- src/trees/bp/mod.rs | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 100ec64..2c35643 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -544,10 +544,7 @@ impl From for RsVec { impl From for BitVec { fn from(value: RsVec) -> Self { - BitVec { - data: value.data, - len: value.len, - } + value.into_bit_vec() } } diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 1dc7eb9..40b7e59 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -751,7 +751,7 @@ impl From for BpTree { impl From> for BitVec { fn from(value: BpTree) -> Self { - value.vec.into() + value.into_parentheses_vec().into_bit_vec() } } From b571822387e206edb77b63bc09ca55746121d3fc Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 20:36:52 +0200 Subject: [PATCH 09/23] revert From implementation for BitVec --- src/wavelet/mod.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/wavelet/mod.rs b/src/wavelet/mod.rs index b463535..3d08602 100644 --- a/src/wavelet/mod.rs +++ b/src/wavelet/mod.rs @@ -2053,14 +2053,6 @@ impl WaveletMatrix { } } -impl From for BitVec { - fn from(value: WaveletMatrix) -> Self { - let mut output = BitVec::new(); - output.extend(value); - output - } -} - impl_vector_iterator!( WaveletMatrix, WaveletIter, From 952efa22e3e2443323d7b14392c7e6c325ab12a5 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 20:40:16 +0200 Subject: [PATCH 10/23] implement From for RsVec --- src/trees/bp/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 40b7e59..6b9e89c 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -755,6 +755,12 @@ impl From> for BitVec { } } +impl From> for RsVec { + fn from(value: BpTree) -> Self { + value.into_parentheses_vec() + } +} + /// An iterator over the children of a 
node. /// Calls to `next` return the next child node handle in the order they appear in the parenthesis /// expression. From 97ccf057a8d9920dd22b42339096922b5516ac74 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:06:28 +0200 Subject: [PATCH 11/23] specify returned bit order of `BitVec::get_bits` --- src/bit_vec/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index fca706b..a40dde1 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -925,6 +925,9 @@ impl BitVec { /// If the position at the end of the query is larger than the length of the vector, /// None is returned (even if the query partially overlaps with the vector). /// If the length of the query is larger than 64, None is returned. + /// + /// The first bit at `pos` is the least significant bit of the return value + /// limited to `len` bits. #[must_use] pub fn get_bits(&self, pos: usize, len: usize) -> Option { if len > WORD_SIZE || len == 0 { From 14eb4cc4800195791027534d0fc489df924ba6a6 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:10:08 +0200 Subject: [PATCH 12/23] add test case for `BitVec::extend` --- src/bit_vec/tests.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index 4255374..4a354ed 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -626,3 +626,43 @@ fn test_unpack() { assert_eq!(bv.unpack_element(8, 10), None); assert_eq!(bv.unpack_element(1000, 10), None); } + +#[test] +fn test_extend() { + // test bitvec extend + let mut bv = BitVec::from_zeros(10); + let bv_ones = BitVec::from_ones(10); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 20); + assert_eq!(bv.get_bits(0, 20), Some(0b11111111110000000000)); + + // extend with an empty bitvec + let mut bv = BitVec::from_zeros(10); + bv.extend_bitvec(&BitVec::default()); + assert_eq!(bv.len, 10); + assert_eq!(bv.get_bits(0, 10), 
Some(0)); + + // test extend of empty bitvec + let mut bv = BitVec::default(); + let bv_ones = BitVec::from_ones(10); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 10); + assert_eq!(bv.get_bits(0, 10), Some(0b1111111111)); + + // test large vectors + let mut bv = BitVec::from_zeros(1000); + let bv_ones = BitVec::from_ones(1000); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 2000); + // sanity check: + assert_eq!(bv.get_bits(64, 64), Some(0)); + assert_eq!(bv.get_bits(1064, 64), Some(u64::MAX)); + + // test aligned vectors + let mut bv = BitVec::from_zeros(64); + let bv_ones = BitVec::from_ones(64); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 128); + assert_eq!(bv.get_bits(0, 64), Some(0)); + assert_eq!(bv.get_bits(64, 64), Some(u64::MAX)); +} From 752936cfbbe94dd600b97e8ec2fc18c94b56913e Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:15:02 +0200 Subject: [PATCH 13/23] ensure extend only reallocates once, update its documentation --- src/bit_vec/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index a40dde1..d94c07f 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -736,8 +736,15 @@ impl BitVec { self.len += len; } - /// Append the bits of another bit vector to the vector. + /// Append the bits of another bit vector to the end of this vector. + /// If this vector does not contain a multiple of 64 bits, the appended limbs need to be + /// shifted to the left. + /// This function is guaranteed to reallocate the underlying vector at most once. 
pub fn extend_bitvec(&mut self, other: &Self) { + // reserve space for the new bits, ensuring at most one re-allocation + self.data + .reserve((self.len + other.len).div_ceil(WORD_SIZE) - self.data.len()); + let full_limbs = other.len() / WORD_SIZE; for i in 0..full_limbs { self.append_bits(other.data[i], WORD_SIZE); From cd23966ae537001d2adf1bd8b11914a7626352e2 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:15:54 +0200 Subject: [PATCH 14/23] add error section to `split_at` documentation --- src/bit_vec/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index d94c07f..8fd4328 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1220,6 +1220,10 @@ impl BitVec { /// right half the remaining bits `at..`. If the split index is larger than the length of the /// vector, the vector is returned unmodified in an `Err` variant. /// + /// # Errors + /// If the index is out of bounds, the function will return an error + /// containing the original vector. + /// /// See also: [`split_at_unchecked`] pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> { if at >= self.len { From 7e2b2c701062af089773ac14e5697606a121fb8c Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:27:22 +0200 Subject: [PATCH 15/23] fix: split_at_unchecked doesn't drop second part from self --- src/bit_vec/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 8fd4328..10e5f18 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1240,7 +1240,7 @@ impl BitVec { /// If the index is larger than the length of the vector the function will panic or return /// unpredictable data. Use [`split_at`] to properly handle this case. 
#[must_use] - pub fn split_at_unchecked(self, at: usize) -> (Self, Self) { + pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) { let other_len = self.len - at; let mut other = Self::with_capacity(other_len); @@ -1267,6 +1267,9 @@ impl BitVec { other.append_bits_unchecked(self.data[full_limbs], trailing_partial); } + // remove the copied bits from the original vector + self.drop_last(other_len); + (self, other) } } From 9269caf6f3297c50936d8becf5afd5e49a5fe759 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:27:58 +0200 Subject: [PATCH 16/23] fix: split_at_unchecked copies wrong partial limb at the start of the second part --- src/bit_vec/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 10e5f18..d3bf750 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1247,7 +1247,7 @@ impl BitVec { let first_limb = at / WORD_SIZE; let full_limbs = self.len / WORD_SIZE; - let leading_partial = at % WORD_SIZE; + let leading_partial = WORD_SIZE - (at % WORD_SIZE); let iter_limbs = if leading_partial > 0 { other.append_bits_unchecked( self.data[first_limb] >> (WORD_SIZE - leading_partial), From 69756b20e6c2ce96f0cc12a4a407f197deb12aef Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:49:46 +0200 Subject: [PATCH 17/23] fix: if we split in the last limb, or in a single-limb vector, we could be copying past the vector, or copying the same data twice --- src/bit_vec/mod.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index d3bf750..8e93349 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1247,7 +1247,11 @@ impl BitVec { let first_limb = at / WORD_SIZE; let full_limbs = self.len / WORD_SIZE; - let leading_partial = WORD_SIZE - (at % WORD_SIZE); + // if we start in the middle of a limb, we need to copy the leading partial limb. 
+ // however, we limit the range to the size of the other vector, since we could be starting + // in the last limb of the original vector + let leading_partial = min((WORD_SIZE - (at % WORD_SIZE)) % WORD_SIZE, other_len); + let iter_limbs = if leading_partial > 0 { other.append_bits_unchecked( self.data[first_limb] >> (WORD_SIZE - leading_partial), @@ -1262,9 +1266,13 @@ impl BitVec { other.append_bits_unchecked(self.data[i], WORD_SIZE); } - let trailing_partial = self.len % WORD_SIZE; - if trailing_partial > 0 { - other.append_bits_unchecked(self.data[full_limbs], trailing_partial); + // if we did not start in the last limb, and there are bits left we didn't copy, + // we need to copy the remaining incomplete limb + if full_limbs > first_limb { + let trailing_partial = self.len - full_limbs * WORD_SIZE; + if trailing_partial > 0 { + other.append_bits_unchecked(self.data[full_limbs], trailing_partial); + } } // remove the copied bits from the original vector From dce75126dff722d5d0906d789614bbbd2f286891 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:49:54 +0200 Subject: [PATCH 18/23] add test case for split_at --- src/bit_vec/tests.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index 4a354ed..bf75eed 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -666,3 +666,55 @@ fn test_extend() { assert_eq!(bv.get_bits(0, 64), Some(0)); assert_eq!(bv.get_bits(64, 64), Some(u64::MAX)); } + +#[test] +fn test_split_at() { + // test the split_at(_unchecked) function + let mut bv = BitVec::from_zeros(64); + bv.flip_bit(1); + bv.flip_bit(3); + + // check splitting at 2 + let (left, right) = bv.split_at_unchecked(2); + assert_eq!(left.len, 2); + assert_eq!(right.len, 62); + assert_eq!(left.get(0), Some(0)); + assert_eq!(left.get(1), Some(1)); + assert_eq!(right.get(0), Some(0)); + assert_eq!(right.get(1), Some(1)); + assert_eq!(right.get_bits(2, 
60), Some(0)); + + // check splitting at 0 + let bv = BitVec::from_zeros(1000); + let (left, right) = bv.split_at_unchecked(0); + assert_eq!(left.len, 0); + assert_eq!(right.len, 1000); + assert_eq!(right.get(999), Some(0)); + + // check splitting at the end + let bv = BitVec::from_zeros(1000); + let (left, right) = bv.split_at_unchecked(1000); + assert_eq!(left.len, 1000); + assert_eq!(right.len, 0); + assert_eq!(left.get(999), Some(0)); + + // check splitting aligned + let bv = BitVec::from_ones(128); + let (left, right) = bv.split_at_unchecked(64); + assert_eq!(left.len, 64); + assert_eq!(right.len, 64); + assert_eq!(left.get_bits(0, 64), Some(u64::MAX)); + assert_eq!(right.get_bits(0, 64), Some(u64::MAX)); + + // check splitting in single limb + let bv = BitVec::from_ones(20); + let (left, right) = bv.split_at_unchecked(10); + assert_eq!(left.len, 10); + assert_eq!(right.len, 10); + + // check splitting empty vector + let bv = BitVec::default(); + let (left, right) = bv.split_at_unchecked(0); + assert_eq!(left.len, 0); + assert_eq!(right.len, 0); +} From 90b155822daca629aa1f47d908545ab3469973a1 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Mon, 12 May 2025 21:52:11 +0200 Subject: [PATCH 19/23] add test for the checked version of split_at --- src/bit_vec/tests.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index bf75eed..b57d03c 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -718,3 +718,32 @@ fn test_split_at() { assert_eq!(left.len, 0); assert_eq!(right.len, 0); } + +#[test] +fn test_split_at_result() { + // check splitting at 1 + let mut bv = BitVec::from_zeros(2); + bv.flip_bit(1); + let (left, right) = bv.split_at(1).expect("failed to split"); + assert_eq!(left.len, 1); + assert_eq!(right.len, 1); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at 0 + let bv = BitVec::from_zeros(2); + let (left, right) = 
bv.split_at(0).expect("failed to split"); + assert_eq!(left.len, 0); + assert_eq!(right.len, 2); + + // check splitting at the end + let bv = BitVec::from_zeros(2); + let (left, right) = bv.split_at(2).expect("failed to split"); + assert_eq!(left.len, 2); + assert_eq!(right.len, 0); + + // check splitting past the end + let bv = BitVec::from_zeros(2); + let result = bv.split_at(3); + assert!(result.is_err()); +} From b800b03ced6c3e74ade85801783d8ef897b4e438 Mon Sep 17 00:00:00 2001 From: Arne Skjaerholt Date: Tue, 13 May 2025 19:31:57 +0200 Subject: [PATCH 20/23] Fix split_at implementation, and add more tests for splitting. --- src/bit_vec/mod.rs | 53 +++++++++++++++++++++++++------------------ src/bit_vec/tests.rs | 54 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 8e93349..841ec0d 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1226,7 +1226,7 @@ impl BitVec { /// /// See also: [`split_at_unchecked`] pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> { - if at >= self.len { + if at > self.len { Err(self) } else { Ok(self.split_at_unchecked(at)) @@ -1245,33 +1245,42 @@ impl BitVec { let mut other = Self::with_capacity(other_len); let first_limb = at / WORD_SIZE; - let full_limbs = self.len / WORD_SIZE; + let last_limb = self.len / WORD_SIZE; - // if we start in the middle of a limb, we need to copy the leading partial limb. 
- // however, we limit the range to the size of the other vector, since we could be starting - // in the last limb of the original vector - let leading_partial = min((WORD_SIZE - (at % WORD_SIZE)) % WORD_SIZE, other_len); + // First, we figure out the number of bits from the first limb to retain in this vector: + let leading_partial = at % WORD_SIZE; - let iter_limbs = if leading_partial > 0 { - other.append_bits_unchecked( - self.data[first_limb] >> (WORD_SIZE - leading_partial), - leading_partial, - ); - first_limb + 1..full_limbs + // If the split point is in the last limb, and the vector ends before the last bit, first_limb + // and last_limb will be equal, and the other half is simply other_len bits off the limb + // right shifted by the number of bits to retain in this vector. + if first_limb == last_limb { + other.append_bits_unchecked(self.data[first_limb] >> leading_partial, other_len); } else { - first_limb..full_limbs - }; + // Otherwise, some range n..last_limb should be copied in their entirety to the other half, + // with n=first_limb+1 if the split point is inside the first limb (leading_partial > 0), or + // n=first_limb if the entire first limb belongs in the other half. + let full_limbs = if leading_partial > 0 { + // If the split point is inside the first limb, we also have to remember to copy over + // the trailing bits to the new vector. + other.append_bits_unchecked( + self.data[first_limb] >> leading_partial, + WORD_SIZE - leading_partial, + ); + first_limb + 1..last_limb + } else { + first_limb..last_limb + }; - for i in iter_limbs { - other.append_bits_unchecked(self.data[i], WORD_SIZE); - } + // Copy over any full limbs. 
+ for i in full_limbs { + other.append_bits_unchecked(self.data[i], WORD_SIZE); + } - // if we did not start in the last limb, and there are bits left we didn't copy, - // we need to copy the remaining incomplete limb - if full_limbs > first_limb { - let trailing_partial = self.len - full_limbs * WORD_SIZE; + // Finally, if the vector has a partially filled last limb, we need to put those bits + // in the other half. + let trailing_partial = self.len % WORD_SIZE; if trailing_partial > 0 { - other.append_bits_unchecked(self.data[full_limbs], trailing_partial); + other.append_bits_unchecked(self.data[last_limb], trailing_partial); } } diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index b57d03c..1aa416d 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -746,4 +746,58 @@ fn test_split_at_result() { let bv = BitVec::from_zeros(2); let result = bv.split_at(3); assert!(result.is_err()); + + // check splitting inside a limb, with the end inside the next limb + let mut bv = BitVec::from_zeros(68); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 8); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting inside a limb, with the complete next limb being the final limb + let mut bv = BitVec::from_zeros(128); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 68); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting inside a limb, with a complete and then partial limb following + let mut bv = BitVec::from_zeros(140); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 80); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with the end inside the 
next limb + let mut bv = BitVec::from_zeros(144); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + assert_eq!(right.len, 80); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with the complete next limb being the final limb + let mut bv = BitVec::from_zeros(192); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + assert_eq!(right.len, 128); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with a complete and then partial limb following + let mut bv = BitVec::from_zeros(200); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + assert_eq!(right.len, 136); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); } From 0a7dcd5b4a49c342281ee82b5e1145c7a4f02d63 Mon Sep 17 00:00:00 2001 From: Arne Skjaerholt Date: Wed, 14 May 2025 00:45:33 +0200 Subject: [PATCH 21/23] Early return in split if second half is empty. 
--- src/bit_vec/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 841ec0d..32213ec 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1244,6 +1244,10 @@ impl BitVec { let other_len = self.len - at; let mut other = Self::with_capacity(other_len); + if other_len == 0 { + return (self, other); + } + let first_limb = at / WORD_SIZE; let last_limb = self.len / WORD_SIZE; From 63ef1235f3a0b2945e543fa5d98107ec7c309594 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Thu, 15 May 2025 10:48:09 +0200 Subject: [PATCH 22/23] cleanup tests, add test for empty vector --- src/bit_vec/tests.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index 1aa416d..51275c4 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -747,6 +747,18 @@ fn test_split_at_result() { let result = bv.split_at(3); assert!(result.is_err()); + // check splitting empty vec + let bv = BitVec::default(); + let (left, right) = bv.split_at(0).expect("failed to split"); + assert!(left.is_empty()); + assert!(right.is_empty()); +} + +#[test] +fn test_splitting_limbs() { + // this test might overlap with test_split_at. + // we test all variations of splitting in limbs of bit vecs + // check splitting inside a limb, with the end inside the next limb let mut bv = BitVec::from_zeros(68); bv.flip_bit(60); From 8e8c60fe5d95b33be7f6c5fc30865a3f64e028e2 Mon Sep 17 00:00:00 2001 From: Johannes Hengstler Date: Thu, 15 May 2025 10:51:47 +0200 Subject: [PATCH 23/23] slight doc adjustment --- src/bit_vec/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index 32213ec..25b16eb 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -1237,8 +1237,9 @@ impl BitVec { /// right half the remaining bits `at..`. 
/// /// # Panics - /// If the index is larger than the length of the vector the function will panic or return - /// unpredictable data. Use [`split_at`] to properly handle this case. + /// If the index is larger than the length of the vector the function will panic or run + /// out of memory. + /// Use [`split_at`] to properly handle this case. #[must_use] pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) { let other_len = self.len - at;