diff --git a/src/bit_vec/fast_rs_vec/mod.rs b/src/bit_vec/fast_rs_vec/mod.rs index 9014988..2c35643 100644 --- a/src/bit_vec/fast_rs_vec/mod.rs +++ b/src/bit_vec/fast_rs_vec/mod.rs @@ -373,16 +373,43 @@ impl RsVec { pub fn get_bits_unchecked(&self, pos: usize, len: usize) -> u64 { debug_assert!(len <= WORD_SIZE); let partial_word = self.data[pos / WORD_SIZE] >> (pos % WORD_SIZE); - if pos % WORD_SIZE + len == WORD_SIZE { - partial_word - } else if pos % WORD_SIZE + len < WORD_SIZE { - partial_word & ((1 << (len % WORD_SIZE)) - 1) + if pos % WORD_SIZE + len <= WORD_SIZE { + partial_word & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) } else { (partial_word | (self.data[pos / WORD_SIZE + 1] << (WORD_SIZE - pos % WORD_SIZE))) & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) } } + /// Convert the `RsVec` into a [`BitVec`]. + /// This consumes the `RsVec`, and discards all meta-data. + /// Since [`RsVec`]s are innately immutable, this conversion is the only way to modify the + /// underlying data. + /// + /// # Example + /// ```rust + /// use vers_vecs::{BitVec, RsVec}; + /// + /// let mut bit_vec = BitVec::new(); + /// bit_vec.append_word(u64::MAX); + /// + /// let rs_vec = RsVec::from_bit_vec(bit_vec); + /// assert_eq!(rs_vec.rank1(64), 64); + /// + /// let mut bit_vec = rs_vec.into_bit_vec(); + /// bit_vec.flip_bit(32); + /// let rs_vec = RsVec::from_bit_vec(bit_vec); + /// assert_eq!(rs_vec.rank1(64), 63); + /// assert_eq!(rs_vec.select0(0), 32); + /// ``` + #[must_use] + pub fn into_bit_vec(self) -> BitVec { + BitVec { + data: self.data, + len: self.len, + } + } + /// Check if two `RsVec`s are equal. For sparse vectors (either sparsely filled with 1-bits or /// 0-bits), this is faster than comparing the vectors bit by bit. /// Choose the value of `ZERO` depending on which bits are more sparse. 
@@ -515,6 +542,12 @@ impl From for RsVec { } } +impl From for BitVec { + fn from(value: RsVec) -> Self { + value.into_bit_vec() + } +} + // iter code in here to keep it more organized mod iter; // select code in here to keep it more organized diff --git a/src/bit_vec/mod.rs b/src/bit_vec/mod.rs index ef4eea5..25b16eb 100644 --- a/src/bit_vec/mod.rs +++ b/src/bit_vec/mod.rs @@ -736,6 +736,26 @@ impl BitVec { self.len += len; } + /// Append the bits of another bit vector to the end of this vector. + /// If this vector does not contain a multiple of 64 bits, the appended limbs need to be + /// shifted to the left. + /// This function is guaranteed to reallocate the underlying vector at most once. + pub fn extend_bitvec(&mut self, other: &Self) { + // reserve space for the new bits, ensuring at most one re-allocation + self.data + .reserve((self.len + other.len).div_ceil(WORD_SIZE) - self.data.len()); + + let full_limbs = other.len() / WORD_SIZE; + for i in 0..full_limbs { + self.append_bits(other.data[i], WORD_SIZE); + } + + let partial_bits = other.len % WORD_SIZE; + if partial_bits > 0 { + self.append_bits(other.data[full_limbs], partial_bits); + } + } + /// Return the length of the bit vector. The length is measured in bits. #[must_use] pub fn len(&self) -> usize { @@ -912,6 +932,9 @@ impl BitVec { /// If the position at the end of the query is larger than the length of the vector, /// None is returned (even if the query partially overlaps with the vector). /// If the length of the query is larger than 64, None is returned. + /// + /// The first bit at `pos` is the least significant bit of the return value + /// limited to `len` bits. 
#[must_use] pub fn get_bits(&self, pos: usize, len: usize) -> Option { if len > WORD_SIZE || len == 0 { @@ -948,10 +971,8 @@ impl BitVec { pub fn get_bits_unchecked(&self, pos: usize, len: usize) -> u64 { debug_assert!(len <= WORD_SIZE); let partial_word = self.data[pos / WORD_SIZE] >> (pos % WORD_SIZE); - if pos % WORD_SIZE + len == WORD_SIZE { - partial_word - } else if pos % WORD_SIZE + len < WORD_SIZE { - partial_word & ((1 << (len % WORD_SIZE)) - 1) + if pos % WORD_SIZE + len <= WORD_SIZE { + partial_word & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) } else { (partial_word | (self.data[pos / WORD_SIZE + 1] << (WORD_SIZE - pos % WORD_SIZE))) & 1u64.checked_shl(len as u32).unwrap_or(0).wrapping_sub(1) @@ -1194,6 +1215,85 @@ impl BitVec { pub fn heap_size(&self) -> usize { self.data.len() * size_of::() } + + /// Split the vector in two at the specified index. The left half contains bits `0..at` and the + /// right half the remaining bits `at..`. If the split index is larger than the length of the + /// vector, the vector is returned unmodified in an `Err` variant. + /// + /// # Errors + /// If the index is out of bounds, the function will return an error + /// containing the original vector. + /// + /// See also: [`Self::split_at_unchecked`] + pub fn split_at(self, at: usize) -> Result<(Self, Self), Self> { + if at > self.len { + Err(self) + } else { + Ok(self.split_at_unchecked(at)) + } + } + + /// Split the vector in two at the specified index. The left half contains bits `0..at` and the + /// right half the remaining bits `at..`. + /// + /// # Panics + /// If the index is larger than the length of the vector the function will panic or run + /// out of memory. + /// Use [`Self::split_at`] to properly handle this case. 
+ #[must_use] + pub fn split_at_unchecked(mut self, at: usize) -> (Self, Self) { + let other_len = self.len - at; + let mut other = Self::with_capacity(other_len); + + if other_len == 0 { + return (self, other); + } + + let first_limb = at / WORD_SIZE; + let last_limb = self.len / WORD_SIZE; + + // First, we figure out the number of bits from the first limb to retain in this vector: + let leading_partial = at % WORD_SIZE; + + // If the split point is in the last limb, and the vector ends before the last bit, first_limb + // and last_limb will be equal, and the other half is simply other_len bits off the limb + // right shifted by the number of bits to retain in this vector. + if first_limb == last_limb { + other.append_bits_unchecked(self.data[first_limb] >> leading_partial, other_len); + } else { + // Otherwise, some range n..last_limb should be copied in their entirety to the other half, + // with n=first_limb+1 if the split point is inside the first limb (leading_partial > 0), or + // n=first_limb if the entire first limb belongs in the other half. + let full_limbs = if leading_partial > 0 { + // If the split point is inside the first limb, we also have to remember to copy over + // the trailing bits to the new vector. + other.append_bits_unchecked( + self.data[first_limb] >> leading_partial, + WORD_SIZE - leading_partial, + ); + first_limb + 1..last_limb + } else { + first_limb..last_limb + }; + + // Copy over any full limbs. + for i in full_limbs { + other.append_bits_unchecked(self.data[i], WORD_SIZE); + } + + // Finally, if the vector has a partially filled last limb, we need to put those bits + // in the other half. + let trailing_partial = self.len % WORD_SIZE; + if trailing_partial > 0 { + other.append_bits_unchecked(self.data[last_limb], trailing_partial); + } + } + + // remove the copied bits from the original vector + self.drop_last(other_len); + + (self, other) + } } impl_vector_iterator! 
{ BitVec, BitVecIter, BitVecRefIter } @@ -1218,6 +1318,22 @@ impl From> for BitVec { } } +impl Extend for BitVec { + fn extend>(&mut self, iter: T) { + for v in iter { + self.extend_bitvec(&v) + } + } +} + +impl<'t> Extend<&'t BitVec> for BitVec { + fn extend>(&mut self, iter: T) { + for v in iter { + self.extend_bitvec(v) + } + } +} + /// Create a new bit vector from u64 values. /// The bits are appended in little-endian order (i.e. the least significant bit is appended first). /// The function will append the bits of each element to the bit vector in the order they are diff --git a/src/bit_vec/tests.rs b/src/bit_vec/tests.rs index 4255374..51275c4 100644 --- a/src/bit_vec/tests.rs +++ b/src/bit_vec/tests.rs @@ -626,3 +626,190 @@ fn test_unpack() { assert_eq!(bv.unpack_element(8, 10), None); assert_eq!(bv.unpack_element(1000, 10), None); } + +#[test] +fn test_extend() { + // test bitvec extend + let mut bv = BitVec::from_zeros(10); + let bv_ones = BitVec::from_ones(10); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 20); + assert_eq!(bv.get_bits(0, 20), Some(0b11111111110000000000)); + + // extend with an empty bitvec + let mut bv = BitVec::from_zeros(10); + bv.extend_bitvec(&BitVec::default()); + assert_eq!(bv.len, 10); + assert_eq!(bv.get_bits(0, 10), Some(0)); + + // test extend of empty bitvec + let mut bv = BitVec::default(); + let bv_ones = BitVec::from_ones(10); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 10); + assert_eq!(bv.get_bits(0, 10), Some(0b1111111111)); + + // test large vectors + let mut bv = BitVec::from_zeros(1000); + let bv_ones = BitVec::from_ones(1000); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 2000); + // sanity check: + assert_eq!(bv.get_bits(64, 64), Some(0)); + assert_eq!(bv.get_bits(1064, 64), Some(u64::MAX)); + + // test aligned vectors + let mut bv = BitVec::from_zeros(64); + let bv_ones = BitVec::from_ones(64); + bv.extend_bitvec(&bv_ones); + assert_eq!(bv.len, 128); + assert_eq!(bv.get_bits(0, 64), Some(0)); + 
assert_eq!(bv.get_bits(64, 64), Some(u64::MAX)); +} + +#[test] +fn test_split_at() { + // test the split_at(_unchecked) function + let mut bv = BitVec::from_zeros(64); + bv.flip_bit(1); + bv.flip_bit(3); + + // check splitting at 2 + let (left, right) = bv.split_at_unchecked(2); + assert_eq!(left.len, 2); + assert_eq!(right.len, 62); + assert_eq!(left.get(0), Some(0)); + assert_eq!(left.get(1), Some(1)); + assert_eq!(right.get(0), Some(0)); + assert_eq!(right.get(1), Some(1)); + assert_eq!(right.get_bits(2, 60), Some(0)); + + // check splitting at 0 + let bv = BitVec::from_zeros(1000); + let (left, right) = bv.split_at_unchecked(0); + assert_eq!(left.len, 0); + assert_eq!(right.len, 1000); + assert_eq!(right.get(999), Some(0)); + + // check splitting at the end + let bv = BitVec::from_zeros(1000); + let (left, right) = bv.split_at_unchecked(1000); + assert_eq!(left.len, 1000); + assert_eq!(right.len, 0); + assert_eq!(left.get(999), Some(0)); + + // check splitting aligned + let bv = BitVec::from_ones(128); + let (left, right) = bv.split_at_unchecked(64); + assert_eq!(left.len, 64); + assert_eq!(right.len, 64); + assert_eq!(left.get_bits(0, 64), Some(u64::MAX)); + assert_eq!(right.get_bits(0, 64), Some(u64::MAX)); + + // check splitting in single limb + let bv = BitVec::from_ones(20); + let (left, right) = bv.split_at_unchecked(10); + assert_eq!(left.len, 10); + assert_eq!(right.len, 10); + + // check splitting empty vector + let bv = BitVec::default(); + let (left, right) = bv.split_at_unchecked(0); + assert_eq!(left.len, 0); + assert_eq!(right.len, 0); +} + +#[test] +fn test_split_at_result() { + // check splitting at 1 + let mut bv = BitVec::from_zeros(2); + bv.flip_bit(1); + let (left, right) = bv.split_at(1).expect("failed to split"); + assert_eq!(left.len, 1); + assert_eq!(right.len, 1); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at 0 + let bv = BitVec::from_zeros(2); + let (left, right) = 
bv.split_at(0).expect("failed to split"); + assert_eq!(left.len, 0); + assert_eq!(right.len, 2); + + // check splitting at the end + let bv = BitVec::from_zeros(2); + let (left, right) = bv.split_at(2).expect("failed to split"); + assert_eq!(left.len, 2); + assert_eq!(right.len, 0); + + // check splitting past the end + let bv = BitVec::from_zeros(2); + let result = bv.split_at(3); + assert!(result.is_err()); + + // check splitting empty vec + let bv = BitVec::default(); + let (left, right) = bv.split_at(0).expect("failed to split"); + assert!(left.is_empty()); + assert!(right.is_empty()); +} + +#[test] +fn test_splitting_limbs() { + // this test might overlap with test_split_at. + // we test all variations of splitting in limbs of bit vecs + + // check splitting inside a limb, with the end inside the next limb + let mut bv = BitVec::from_zeros(68); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 8); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting inside a limb, with the complete next limb being the final limb + let mut bv = BitVec::from_zeros(128); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 68); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting inside a limb, with a complete and then partial limb following + let mut bv = BitVec::from_zeros(140); + bv.flip_bit(60); + let (left, right) = bv.split_at(60).expect("failed to split"); + assert_eq!(left.len, 60); + assert_eq!(right.len, 80); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with the end inside the next limb + let mut bv = BitVec::from_zeros(144); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + 
assert_eq!(right.len, 80); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with the complete next limb being the final limb + let mut bv = BitVec::from_zeros(192); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + assert_eq!(right.len, 128); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); + + // check splitting at the beginning of a limb, with a complete and then partial limb following + let mut bv = BitVec::from_zeros(200); + bv.flip_bit(64); + let (left, right) = bv.split_at(64).expect("failed to split"); + assert_eq!(left.len, 64); + assert_eq!(right.len, 136); + assert_eq!(left.get(0), Some(0)); + assert_eq!(right.get(0), Some(1)); +} diff --git a/src/trees/bp/mod.rs b/src/trees/bp/mod.rs index 1262e90..6b9e89c 100644 --- a/src/trees/bp/mod.rs +++ b/src/trees/bp/mod.rs @@ -506,6 +506,36 @@ impl BpTree { ChildrenIter::::new(self, node) } + /// Transform the tree into a [`RsVec`] containing the balanced parenthesis expression. + /// This consumes the tree and returns the underlying bit vector with the rank and select + /// support structure. + /// The remaining min-max-tree support structure of the `BpTree` is discarded. + /// Since the tree is innately immutable, this is the only way to access the underlying bit + /// vector for potential modification. + /// Modification requires turning the `RsVec` back into a `BitVec`, discarding the rank and select + /// support structure, however. 
+ /// + /// # Examples + /// ```rust + /// use vers_vecs::{BitVec, RsVec, BpTree, Tree}; + /// + /// let bv = BitVec::pack_sequence_u8(&[0b1101_0111, 0b0010_0100], 8); + /// let tree = BpTree::<4>::from_bit_vector(bv); + /// assert_eq!(tree.size(), 8); + /// + /// let rs_vec = tree.into_parentheses_vec(); + /// let mut bv = rs_vec.into_bit_vec(); + /// + /// bv.flip_bit(15); + /// bv.append_bits(0, 2); + /// let tree = BpTree::<4>::from_bit_vector(bv); + /// assert_eq!(tree.size(), 9); + /// ``` + #[must_use] + pub fn into_parentheses_vec(self) -> RsVec { + self.vec + } + /// Returns the number of bytes used on the heap for this tree. This does not include /// allocated space that is not used (e.g. by the allocation behavior of `Vec`). #[must_use] @@ -719,6 +749,18 @@ impl From for BpTree { } } +impl From> for BitVec { + fn from(value: BpTree) -> Self { + value.into_parentheses_vec().into_bit_vec() + } +} + +impl From> for RsVec { + fn from(value: BpTree) -> Self { + value.into_parentheses_vec() + } +} + /// An iterator over the children of a node. /// Calls to `next` return the next child node handle in the order they appear in the parenthesis /// expression.