From d90d6f0f004946f4ac05140a02bc4c5c5e50deec Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Wed, 23 Jul 2025 22:43:18 +0100 Subject: [PATCH 1/9] implement basic serialization and deserialization of diff counts as bytes --- crates/geo_filters/src/diff_count.rs | 104 +++++++++++++++++++- crates/geo_filters/src/diff_count/bitvec.rs | 64 +++++++++++- 2 files changed, 163 insertions(+), 5 deletions(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 7e784c6..fd97c15 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; +use std::ops::Deref; use crate::config::{ count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones, @@ -77,7 +78,7 @@ impl> std::fmt::Debug for GeoDiffCount<'_, C> { } } -impl> GeoDiffCount<'_, C> { +impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { pub fn new(config: C) -> Self { Self { config, @@ -86,6 +87,49 @@ impl> GeoDiffCount<'_, C> { } } + pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { + if buf.is_empty() { + return Self::new(c); + } + + // The number of most significant bits stores in the MSB sparse repr + let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); + + let msb = + unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) }; + + // The number of bytes representing the MSB - this is how many bytes we need to + // skip over to reach the LSB + let msb_bytes_len = msb_len * size_of::(); + + Self { + config: c, + msb: Cow::Borrowed(msb), + lsb: BitVec::from_bytes(&buf[msb_bytes_len..]), + } + } + + pub fn write(&self, writer: &mut W) -> std::io::Result { + if self.msb.is_empty() { + return Ok(0); + } + + let msb_buckets = self.msb.deref(); + let msb_bytes = unsafe { + std::slice::from_raw_parts( + msb_buckets.as_ptr() as *const u8, + msb_buckets.len() * size_of::(), + ) + }; + writer.write_all(msb_bytes)?; + + let mut bytes_written = msb_bytes.len(); + + bytes_written += self.lsb.write(writer)?; + + Ok(bytes_written) + } + /// `BitChunk`s can be processed much more efficiently than individual one bits! /// This function makes it possible to construct a GeoDiffCount instance directly from /// `BitChunk`s. It will extract the most significant bits first and then put the remainder @@ -208,16 +252,23 @@ impl> GeoDiffCount<'_, C> { /// that makes the cost of the else case negligible. fn xor_bit(&mut self, bucket: C::BucketType) { if bucket.into_usize() < self.lsb.num_bits() { + // The bit being toggled is within our LSB bit vector + // so toggle it directly. self.lsb.toggle(bucket.into_usize()); } else { let msb = self.msb.to_mut(); match msb.binary_search_by(|k| bucket.cmp(k)) { Ok(idx) => { + // The bit is already set in the MSB sparse bitset, remove it (XOR) msb.remove(idx); + + // We have removed a value from our MSB, move a value in the + // LSB into the MSB let (first, second) = { let mut lsb = iter_ones(self.lsb.bit_chunks().peekable()); (lsb.next(), lsb.next()) }; + let new_smallest = if let Some(smallest) = first { msb.push(C::BucketType::from_usize(smallest)); second.map(|_| smallest).unwrap_or(0) @@ -229,15 +280,19 @@ impl> GeoDiffCount<'_, C> { Err(idx) => { msb.insert(idx, bucket); if msb.len() > self.config.max_msb_len() { + // We have too many values in the MSB sparse index vector, + // let's move the smalles MSB value into the LSB bit vector let smallest = msb .pop() .expect("we should have at least one element!") .into_usize(); - // ensure vector covers smallest + let new_smallest = msb .last() .expect("should have at least one element") .into_usize(); + + // ensure LSB bit vector has the space for `smallest` self.lsb.resize(new_smallest); self.lsb.toggle(smallest); } @@ -360,7 +415,10 @@ impl> Count for GeoDiffCount<'_, C> { #[cfg(test)] mod tests { use itertools::Itertools; - use rand::{RngCore, SeedableRng}; + use rand::{ + seq::{IndexedRandom, IteratorRandom}, + RngCore, SeedableRng, + }; use crate::{ build_hasher::UnstableDefaultBuildHasher, @@ -580,4 +638,44 @@ mod tests { iter_ones(self.bit_chunks().peekable()).map(C::BucketType::from_usize) } } + + #[test] + fn test_serialization_empty() { + let before = GeoDiffCount7::default(); + + let mut writer = vec![]; + before.write(&mut writer).unwrap(); + + assert_eq!(writer.len(), 0); + + let after = GeoDiffCount7::from_bytes(before.config.clone(), &writer); + + assert_eq!(before, after); + } + + #[test] + fn test_serialization_round_trip() { + let mut rnd = rand::rngs::StdRng::from_os_rng(); + + // Run 100 simulations of random values being put into + // a diff counter + for _ in 0..100 { + let mut before = GeoDiffCount7::default(); + + // Select a random number of items to insert + let items = (1..1000).choose(&mut rnd).unwrap(); + + for _ in 0..items { + before.push_hash(rnd.next_u64()); + } + + let mut writer = vec![]; + + before.write(&mut writer).unwrap(); + + let after = GeoDiffCount7::from_bytes(before.config.clone(), &writer); + + assert_eq!(before, after); + } + } } diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index c94a041..31158af 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -2,11 +2,11 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::iter::Peekable; use std::mem::{size_of, size_of_val}; -use std::ops::{Index, Range}; +use std::ops::{Deref as _, Index, Range}; -use crate::config::BitChunk; use crate::config::IsBucketType; use crate::config::BITS_PER_BLOCK; +use crate::config::{BitChunk, BYTES_PER_BLOCK}; /// A bit vector where every bit occupies exactly one bit (in contrast to `Vec` where each /// bit consumes 1 byte). It only implements the minimum number of operations that we need for our @@ -34,6 +34,62 @@ impl PartialOrd for BitVec<'_> { } impl BitVec<'_> { + pub fn from_bytes(mut buf: &[u8]) -> Self { + if buf.is_empty() { + return Self::default(); + } + + // The first byte of the serialized BitVec is used to indicate how many + // of the bits in the left-most byte are *unoccupied*. + // See [`BitVec::write`] implementation for how this is done. + assert!( + buf[0] < 64, + "Number of unoccupied bits should be <64, got {}", + buf[0] + ); + + let num_bits = (buf.len() - 1) * 8 - buf[0] as usize; + buf = &buf[1..]; + + assert_eq!( + buf.len() % BYTES_PER_BLOCK, + 0, + "buffer should be a multiple of 8 bytes, got {}", + buf.len() + ); + + let blocks = unsafe { + std::mem::transmute(std::slice::from_raw_parts( + buf.as_ptr(), + buf.len() / BYTES_PER_BLOCK, + )) + }; + let blocks = Cow::Borrowed(blocks); + + Self { num_bits, blocks } + } + + pub fn write(&self, writer: &mut W) -> std::io::Result { + if self.is_empty() { + return Ok(0); + } + + // First serialize the number of unoccupied bits in the last block as one byte. + let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8; + + writer.write_all(&[unoccupied_bits])?; + + let blocks = self.blocks.deref(); + + let block_bytes = unsafe { + std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK) + }; + + writer.write_all(block_bytes)?; + + Ok(block_bytes.len() + 1) + } + /// Takes an iterator of `BitChunk` items as input and returns the corresponding `BitVec`. /// The order of `BitChunk`s doesn't matter for this function and `BitChunk` may be hitting /// the same block. In this case, the function will simply xor them together. @@ -81,6 +137,10 @@ impl BitVec<'_> { self.num_bits } + pub fn is_empty(&self) -> bool { + self.num_bits() == 0 + } + /// Tests the bit specified by the provided zero-based bit position. pub fn test_bit(&self, index: usize) -> bool { assert!(index < self.num_bits); From 3e3398846f521006e3dcc26e311f74a175e579b5 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Wed, 23 Jul 2025 23:02:22 +0100 Subject: [PATCH 2/9] Refactor serialization tests for geo diff count types The changes extract a shared test helper for verifying serialization of geo diff counts with different bucket types in MSB sparse bit field representation. --- crates/geo_filters/src/diff_count.rs | 31 +++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index fd97c15..258fd41 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -415,10 +415,7 @@ impl> Count for GeoDiffCount<'_, C> { #[cfg(test)] mod tests { use itertools::Itertools; - use rand::{ - seq::{IndexedRandom, IteratorRandom}, - RngCore, SeedableRng, - }; + use rand::{seq::IteratorRandom, RngCore, SeedableRng}; use crate::{ build_hasher::UnstableDefaultBuildHasher, @@ -653,14 +650,17 @@ mod tests { assert_eq!(before, after); } - #[test] - fn test_serialization_round_trip() { + // This helper exists in order to easily test serializing types with different + // bucket types in the MSB sparse bit field representation. See tests below. + fn serialization_round_trip + Default>() { let mut rnd = rand::rngs::StdRng::from_os_rng(); // Run 100 simulations of random values being put into - // a diff counter + // a diff counter. "Serializing" to a vector to emulate + // writing to a disk, and then deserializing and asserting + // the filters are equal. for _ in 0..100 { - let mut before = GeoDiffCount7::default(); + let mut before = GeoDiffCount::<'_, C>::default(); // Select a random number of items to insert let items = (1..1000).choose(&mut rnd).unwrap(); @@ -670,12 +670,23 @@ mod tests { } let mut writer = vec![]; - before.write(&mut writer).unwrap(); - let after = GeoDiffCount7::from_bytes(before.config.clone(), &writer); + let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer); assert_eq!(before, after); } } + + #[test] + fn test_serialization_round_trip_7() { + // Uses a u16 for MSB buckets + serialization_round_trip::(); + } + + #[test] + fn test_serialization_round_trip_13() { + // Uses a u32 for MSB buckets + serialization_round_trip::(); + } } From c143f4f036cd32d03a6bdfbaf856b80ddb4169ed Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 00:38:49 +0100 Subject: [PATCH 3/9] Move serialization code to dedicated module The changes split out serialization-related code from diff_count.rs and bitvec.rs into a new serde.rs module. This improves code organization by separating concerns. --- crates/geo_filters/src/diff_count.rs | 45 +------- crates/geo_filters/src/diff_count/bitvec.rs | 111 ++++++++++---------- crates/geo_filters/src/diff_count/serde.rs | 72 +++++++++++++ 3 files changed, 128 insertions(+), 100 deletions(-) create mode 100644 crates/geo_filters/src/diff_count/serde.rs diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 258fd41..bf7301a 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -4,7 +4,6 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; -use std::ops::Deref; use crate::config::{ count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones, @@ -14,6 +13,7 @@ use crate::{Count, Diff}; mod bitvec; mod config; +mod serde; mod sim_hash; use bitvec::*; @@ -87,49 +87,6 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { } } - pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { - if buf.is_empty() { - return Self::new(c); - } - - // The number of most significant bits stores in the MSB sparse repr - let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); - - let msb = - unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) }; - - // The number of bytes representing the MSB - this is how many bytes we need to - // skip over to reach the LSB - let msb_bytes_len = msb_len * size_of::(); - - Self { - config: c, - msb: Cow::Borrowed(msb), - lsb: BitVec::from_bytes(&buf[msb_bytes_len..]), - } - } - - pub fn write(&self, writer: &mut W) -> std::io::Result { - if self.msb.is_empty() { - return Ok(0); - } - - let msb_buckets = self.msb.deref(); - let msb_bytes = unsafe { - std::slice::from_raw_parts( - msb_buckets.as_ptr() as *const u8, - msb_buckets.len() * size_of::(), - ) - }; - writer.write_all(msb_bytes)?; - - let mut bytes_written = msb_bytes.len(); - - bytes_written += self.lsb.write(writer)?; - - Ok(bytes_written) - } - /// `BitChunk`s can be processed much more efficiently than individual one bits! /// This function makes it possible to construct a GeoDiffCount instance directly from /// `BitChunk`s. It will extract the most significant bits first and then put the remainder diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index 31158af..09c50a2 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -34,62 +34,6 @@ impl PartialOrd for BitVec<'_> { } impl BitVec<'_> { - pub fn from_bytes(mut buf: &[u8]) -> Self { - if buf.is_empty() { - return Self::default(); - } - - // The first byte of the serialized BitVec is used to indicate how many - // of the bits in the left-most byte are *unoccupied*. - // See [`BitVec::write`] implementation for how this is done. - assert!( - buf[0] < 64, - "Number of unoccupied bits should be <64, got {}", - buf[0] - ); - - let num_bits = (buf.len() - 1) * 8 - buf[0] as usize; - buf = &buf[1..]; - - assert_eq!( - buf.len() % BYTES_PER_BLOCK, - 0, - "buffer should be a multiple of 8 bytes, got {}", - buf.len() - ); - - let blocks = unsafe { - std::mem::transmute(std::slice::from_raw_parts( - buf.as_ptr(), - buf.len() / BYTES_PER_BLOCK, - )) - }; - let blocks = Cow::Borrowed(blocks); - - Self { num_bits, blocks } - } - - pub fn write(&self, writer: &mut W) -> std::io::Result { - if self.is_empty() { - return Ok(0); - } - - // First serialize the number of unoccupied bits in the last block as one byte. - let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8; - - writer.write_all(&[unoccupied_bits])?; - - let blocks = self.blocks.deref(); - - let block_bytes = unsafe { - std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK) - }; - - writer.write_all(block_bytes)?; - - Ok(block_bytes.len() + 1) - } - /// Takes an iterator of `BitChunk` items as input and returns the corresponding `BitVec`. /// The order of `BitChunk`s doesn't matter for this function and `BitChunk` may be hitting /// the same block. In this case, the function will simply xor them together. @@ -202,6 +146,61 @@ impl BitVec<'_> { let Self { num_bits, blocks } = self; size_of_val(num_bits) + blocks.len() * size_of::() } + + pub fn from_bytes(mut buf: &[u8]) -> Self { + if buf.is_empty() { + return Self::default(); + } + + // The first byte of the serialized BitVec is used to indicate how many + // of the bits in the left-most byte are *unoccupied*. + // See [`BitVec::write`] implementation for how this is done. + assert!( + buf[0] < 64, + "Number of unoccupied bits should be <64, got {}", + buf[0] + ); + + let num_bits = (buf.len() - 1) * 8 - buf[0] as usize; + buf = &buf[1..]; + + assert_eq!( + buf.len() % BYTES_PER_BLOCK, + 0, + "buffer should be a multiple of 8 bytes, got {}", + buf.len() + ); + + let blocks = unsafe { + std::mem::transmute(std::slice::from_raw_parts( + buf.as_ptr(), + buf.len() / BYTES_PER_BLOCK, + )) + }; + let blocks = Cow::Borrowed(blocks); + + Self { num_bits, blocks } + } + + pub fn write(&self, writer: &mut W) -> std::io::Result { + if self.is_empty() { + return Ok(0); + } + + // First serialize the number of unoccupied bits in the last block as one byte. + let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8; + + writer.write_all(&[unoccupied_bits])?; + + let blocks = self.blocks.deref(); + let block_bytes = unsafe { + std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK) + }; + + writer.write_all(block_bytes)?; + + Ok(block_bytes.len() + 1) + } } impl Index for BitVec<'_> { diff --git a/crates/geo_filters/src/diff_count/serde.rs b/crates/geo_filters/src/diff_count/serde.rs new file mode 100644 index 0000000..552f293 --- /dev/null +++ b/crates/geo_filters/src/diff_count/serde.rs @@ -0,0 +1,72 @@ +//! Convert a [`GeoDiffCount`] to and from byte arrays. +//! +//! Since most of our target platforms are little endian there are more optimised approaches +//! for little endian platforms, just splatting the bytes into the writer. This is contrary +//! to the usual "network endian" approach where big endian is the default, but most of our +//! consumers are little endian so it makes sense for this to be the optimal approach. +//! +//! We still need to support big endian platforms though, but they get a less efficient path. +use std::{borrow::Cow, ops::Deref as _}; + +use crate::{config::GeoConfig, Diff}; + +use super::{bitvec::BitVec, GeoDiffCount}; + +impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { + /// Create a new [`GeoDiffCount`] from a slice of bytes + #[cfg(target_endian = "little")] + pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { + if buf.is_empty() { + return Self::new(c); + } + + // The number of most significant bits stores in the MSB sparse repr + let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); + + let msb = + unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) }; + + // The number of bytes representing the MSB - this is how many bytes we need to + // skip over to reach the LSB + let msb_bytes_len = msb_len * size_of::(); + + Self { + config: c, + msb: Cow::Borrowed(msb), + lsb: BitVec::from_bytes(&buf[msb_bytes_len..]), + } + } + + /// Create a new [`GeoDiffCount`] from a slice of bytes + #[cfg(target_endian = "big")] + pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { + unimplemented!("not supported on big endian platforms") + } + + #[cfg(target_endian = "little")] + pub fn write(&self, writer: &mut W) -> std::io::Result { + if self.msb.is_empty() { + return Ok(0); + } + + let msb_buckets = self.msb.deref(); + let msb_bytes = unsafe { + std::slice::from_raw_parts( + msb_buckets.as_ptr() as *const u8, + msb_buckets.len() * size_of::(), + ) + }; + writer.write_all(msb_bytes)?; + + let mut bytes_written = msb_bytes.len(); + + bytes_written += self.lsb.write(writer)?; + + Ok(bytes_written) + } + + #[cfg(target_endian = "big")] + pub fn write(&self, writer: &mut W) -> std::io::Result { + unimplemented!("not supported on big endian platforms") + } +} From fc9c45d782c7c56e6a94366ff23fe7b9c612809a Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 00:55:34 +0100 Subject: [PATCH 4/9] Modify module docs to explain that big endian is not supported. --- crates/geo_filters/src/diff_count/bitvec.rs | 1 - crates/geo_filters/src/diff_count/serde.rs | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index 09c50a2..01b2b99 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -189,7 +189,6 @@ impl BitVec<'_> { // First serialize the number of unoccupied bits in the last block as one byte. let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8; - writer.write_all(&[unoccupied_bits])?; let blocks = self.blocks.deref(); diff --git a/crates/geo_filters/src/diff_count/serde.rs b/crates/geo_filters/src/diff_count/serde.rs index 552f293..d0d33be 100644 --- a/crates/geo_filters/src/diff_count/serde.rs +++ b/crates/geo_filters/src/diff_count/serde.rs @@ -5,7 +5,11 @@ //! to the usual "network endian" approach where big endian is the default, but most of our //! consumers are little endian so it makes sense for this to be the optimal approach. //! -//! We still need to support big endian platforms though, but they get a less efficient path. +//! For now we do not support big endian platforms. In the future we might add a big endian +//! platform specific implementation which is able to read the little endian serialized +//! representation. For now, if you attempt to serialize a filter on a big endian platform +//! you get a panic. + use std::{borrow::Cow, ops::Deref as _}; use crate::{config::GeoConfig, Diff}; From aa215ce7fa3be40176a087db9479da413d04c9c0 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 10:01:28 +0100 Subject: [PATCH 5/9] Linting fixes and alignment testing Clippy wants explicit types on transmute calls (reasonable). Fixed issue where from_raw_parts was upset about misaligned slices. I think for all platforms we're targetting the alignment of the buckets doesn't matter. This was fixed by not casting the byte buffer pointer to the correct type and instead transmuting after the fact. Fix alignment and unsafe transmutes The changes focus on making alignment handling more safe and explicit in unsafe code blocks, while also adding test coverage for alignment issues. Fix unsafe transmutes and alignment in bitvec module The changes improve type safety in unsafe transmutes and add alignment tests for raw pointer conversions. This fixes potential undefined behavior when converting between slice types. --- crates/geo_filters/src/diff_count.rs | 13 ++++++++++++- crates/geo_filters/src/diff_count/bitvec.rs | 2 +- crates/geo_filters/src/diff_count/serde.rs | 13 +++++++------ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index bf7301a..06b7be4 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -371,6 +371,8 @@ impl> Count for GeoDiffCount<'_, C> { #[cfg(test)] mod tests { + use std::io::Write; + use itertools::Itertools; use rand::{seq::IteratorRandom, RngCore, SeedableRng}; @@ -627,9 +629,18 @@ mod tests { } let mut writer = vec![]; + + // Insert some padding to emulate alignment issues with the slices + // A previous version of this test never panicked even though we were + // violating the alignment preconditions for the `from_raw_parts` function + let pad = (0..8).choose(&mut rnd).unwrap(); + for _ in 0..pad { + writer.write(&[0x0]).unwrap(); + } + before.write(&mut writer).unwrap(); - let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer); + let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad..]); assert_eq!(before, after); } diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index 01b2b99..9a0571b 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -172,7 +172,7 @@ impl BitVec<'_> { ); let blocks = unsafe { - std::mem::transmute(std::slice::from_raw_parts( + std::mem::transmute::<&[u8], &[u64]>(std::slice::from_raw_parts( buf.as_ptr(), buf.len() / BYTES_PER_BLOCK, )) diff --git a/crates/geo_filters/src/diff_count/serde.rs b/crates/geo_filters/src/diff_count/serde.rs index d0d33be..a52ccd7 100644 --- a/crates/geo_filters/src/diff_count/serde.rs +++ b/crates/geo_filters/src/diff_count/serde.rs @@ -27,8 +27,12 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { // The number of most significant bits stores in the MSB sparse repr let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); - let msb = - unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) }; + let msb = unsafe { + std::mem::transmute::<&[u8], &[C::BucketType]>(std::slice::from_raw_parts( + buf.as_ptr(), + msb_len, + )) + }; // The number of bytes representing the MSB - this is how many bytes we need to // skip over to reach the LSB @@ -55,10 +59,7 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { let msb_buckets = self.msb.deref(); let msb_bytes = unsafe { - std::slice::from_raw_parts( - msb_buckets.as_ptr() as *const u8, - msb_buckets.len() * size_of::(), - ) + std::slice::from_raw_parts(msb_buckets.as_ptr() as *const u8, size_of_val(msb_buckets)) }; writer.write_all(msb_bytes)?; From 42f76b8db698511f1379e7b4f4a1040be6207305 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 10:05:02 +0100 Subject: [PATCH 6/9] Fix test name/config mismatch in diff_count test. Thanks Copilot! --- crates/geo_filters/src/diff_count.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 06b7be4..5dab46f 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -655,6 +655,6 @@ mod tests { #[test] fn test_serialization_round_trip_13() { // Uses a u32 for MSB buckets - serialization_round_trip::(); + serialization_round_trip::(); } } From a80c3e67e8e9cf32bfc91c042c36491db740bbc9 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 10:13:13 +0100 Subject: [PATCH 7/9] Update diff_count serialization test to use less clowny padding approach. --- crates/geo_filters/src/diff_count.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 5dab46f..2c5394e 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -633,14 +633,14 @@ mod tests { // Insert some padding to emulate alignment issues with the slices // A previous version of this test never panicked even though we were // violating the alignment preconditions for the `from_raw_parts` function - let pad = (0..8).choose(&mut rnd).unwrap(); - for _ in 0..pad { - writer.write(&[0x0]).unwrap(); - } + let padding = [0_u8; 8]; + let pad_amount = (0..8).choose(&mut rnd).unwrap(); + writer.write_all(&padding[..pad_amount]).unwrap(); before.write(&mut writer).unwrap(); - let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad..]); + let after = + GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad_amount..]); assert_eq!(before, after); } From 3fc9ff327878982bd5a59ec9d7bb897f0e2eec26 Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 13:03:23 +0100 Subject: [PATCH 8/9] Reworks for geofilter serialization work - Remove serde module. Put code into diff_count.rs. - Remove bonus comments - Remove panic-y big endian serialization stuff. - Squish together lines --- crates/geo_filters/src/diff_count.rs | 81 +++++++++++++++------ crates/geo_filters/src/diff_count/bitvec.rs | 4 +- crates/geo_filters/src/diff_count/serde.rs | 77 -------------------- 3 files changed, 63 insertions(+), 99 deletions(-) delete mode 100644 crates/geo_filters/src/diff_count/serde.rs diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 2c5394e..1b0c949 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; +use std::ops::Deref as _; use crate::config::{ count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones, @@ -13,7 +14,6 @@ use crate::{Count, Diff}; mod bitvec; mod config; -mod serde; mod sim_hash; use bitvec::*; @@ -216,16 +216,11 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { let msb = self.msb.to_mut(); match msb.binary_search_by(|k| bucket.cmp(k)) { Ok(idx) => { - // The bit is already set in the MSB sparse bitset, remove it (XOR) msb.remove(idx); - - // We have removed a value from our MSB, move a value in the - // LSB into the MSB let (first, second) = { let mut lsb = iter_ones(self.lsb.bit_chunks().peekable()); (lsb.next(), lsb.next()) }; - let new_smallest = if let Some(smallest) = first { msb.push(C::BucketType::from_usize(smallest)); second.map(|_| smallest).unwrap_or(0) @@ -243,12 +238,10 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { .pop() .expect("we should have at least one element!") .into_usize(); - let new_smallest = msb .last() .expect("should have at least one element") .into_usize(); - // ensure LSB bit vector has the space for `smallest` self.lsb.resize(new_smallest); self.lsb.toggle(smallest); @@ -293,6 +286,57 @@ impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { self.lsb.num_bits(), ); } + + // Serialization: + // + // Since most of our target platforms are little endian there are more optimised approaches + // for little endian platforms, just splatting the bytes into the writer. This is contrary + // to the usual "network endian" approach where big endian is the default, but most of our + // consumers are little endian so it makes sense for this to be the optimal approach. + // + // For now we do not support big endian platforms. In the future we might add a big endian + // platform specific implementation which is able to read the little endian serialized + // representation. For now, if you attempt to serialize a filter on a big endian platform + // you get a panic. + + /// Create a new [`GeoDiffCount`] from a slice of bytes + #[cfg(target_endian = "little")] + pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { + if buf.is_empty() { + return Self::new(c); + } + // The number of most significant bits stores in the MSB sparse repr + let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); + let msb = unsafe { + std::mem::transmute::<&[u8], &[C::BucketType]>(std::slice::from_raw_parts( + buf.as_ptr(), + msb_len, + )) + }; + // The number of bytes representing the MSB - this is how many bytes we need to + // skip over to reach the LSB + let msb_bytes_len = msb_len * size_of::(); + Self { + config: c, + msb: Cow::Borrowed(msb), + lsb: BitVec::from_bytes(&buf[msb_bytes_len..]), + } + } + + #[cfg(target_endian = "little")] + pub fn write(&self, writer: &mut W) -> std::io::Result { + if self.msb.is_empty() { + return Ok(0); + } + let msb_buckets = self.msb.deref(); + let msb_bytes = unsafe { + std::slice::from_raw_parts(msb_buckets.as_ptr() as *const u8, size_of_val(msb_buckets)) + }; + writer.write_all(msb_bytes)?; + let mut bytes_written = msb_bytes.len(); + bytes_written += self.lsb.write(writer)?; + Ok(bytes_written) + } } /// Applies a repeated bit mask to the underlying filter. @@ -611,50 +655,45 @@ mod tests { // This helper exists in order to easily test serializing types with different // bucket types in the MSB sparse bit field representation. See tests below. + #[cfg(target_endian = "little")] fn serialization_round_trip + Default>() { let mut rnd = rand::rngs::StdRng::from_os_rng(); - // Run 100 simulations of random values being put into // a diff counter. "Serializing" to a vector to emulate // writing to a disk, and then deserializing and asserting // the filters are equal. for _ in 0..100 { let mut before = GeoDiffCount::<'_, C>::default(); - - // Select a random number of items to insert + // Select a random number of items to insert. let items = (1..1000).choose(&mut rnd).unwrap(); - for _ in 0..items { before.push_hash(rnd.next_u64()); } - let mut writer = vec![]; - - // Insert some padding to emulate alignment issues with the slices + // Insert some padding to emulate alignment issues with the slices. // A previous version of this test never panicked even though we were - // violating the alignment preconditions for the `from_raw_parts` function + // violating the alignment preconditions for the `from_raw_parts` function. let padding = [0_u8; 8]; let pad_amount = (0..8).choose(&mut rnd).unwrap(); writer.write_all(&padding[..pad_amount]).unwrap(); - before.write(&mut writer).unwrap(); - let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad_amount..]); - assert_eq!(before, after); } } #[test] + #[cfg(target_endian = "little")] fn test_serialization_round_trip_7() { - // Uses a u16 for MSB buckets + // Uses a u16 for MSB buckets. serialization_round_trip::(); } #[test] + #[cfg(target_endian = "little")] fn test_serialization_round_trip_13() { - // Uses a u32 for MSB buckets + // Uses a u32 for MSB buckets. serialization_round_trip::(); } } diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index 9a0571b..e96b65c 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -147,13 +147,14 @@ impl BitVec<'_> { size_of_val(num_bits) + blocks.len() * size_of::() } + #[cfg(target_endian = "little")] pub fn from_bytes(mut buf: &[u8]) -> Self { if buf.is_empty() { return Self::default(); } // The first byte of the serialized BitVec is used to indicate how many - // of the bits in the left-most byte are *unoccupied*. + // of the bits in the left-most u64 block are *unoccupied*. // See [`BitVec::write`] implementation for how this is done. assert!( buf[0] < 64, @@ -182,6 +183,7 @@ impl BitVec<'_> { Self { num_bits, blocks } } + #[cfg(target_endian = "little")] pub fn write(&self, writer: &mut W) -> std::io::Result { if self.is_empty() { return Ok(0); diff --git a/crates/geo_filters/src/diff_count/serde.rs b/crates/geo_filters/src/diff_count/serde.rs deleted file mode 100644 index a52ccd7..0000000 --- a/crates/geo_filters/src/diff_count/serde.rs +++ /dev/null @@ -1,77 +0,0 @@ -//! Convert a [`GeoDiffCount`] to and from byte arrays. -//! -//! Since most of our target platforms are little endian there are more optimised approaches -//! for little endian platforms, just splatting the bytes into the writer. This is contrary -//! to the usual "network endian" approach where big endian is the default, but most of our -//! consumers are little endian so it makes sense for this to be the optimal approach. -//! -//! For now we do not support big endian platforms. In the future we might add a big endian -//! platform specific implementation which is able to read the little endian serialized -//! representation. For now, if you attempt to serialize a filter on a big endian platform -//! you get a panic. - -use std::{borrow::Cow, ops::Deref as _}; - -use crate::{config::GeoConfig, Diff}; - -use super::{bitvec::BitVec, GeoDiffCount}; - -impl<'a, C: GeoConfig> GeoDiffCount<'a, C> { - /// Create a new [`GeoDiffCount`] from a slice of bytes - #[cfg(target_endian = "little")] - pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { - if buf.is_empty() { - return Self::new(c); - } - - // The number of most significant bits stores in the MSB sparse repr - let msb_len = (buf.len() / size_of::()).min(c.max_msb_len()); - - let msb = unsafe { - std::mem::transmute::<&[u8], &[C::BucketType]>(std::slice::from_raw_parts( - buf.as_ptr(), - msb_len, - )) - }; - - // The number of bytes representing the MSB - this is how many bytes we need to - // skip over to reach the LSB - let msb_bytes_len = msb_len * size_of::(); - - Self { - config: c, - msb: Cow::Borrowed(msb), - lsb: BitVec::from_bytes(&buf[msb_bytes_len..]), - } - } - - /// Create a new [`GeoDiffCount`] from a slice of bytes - #[cfg(target_endian = "big")] - pub fn from_bytes(c: C, buf: &'a [u8]) -> Self { - unimplemented!("not supported on big endian platforms") - } - - #[cfg(target_endian = "little")] - pub fn write(&self, writer: &mut W) -> std::io::Result { - if self.msb.is_empty() { - return Ok(0); - } - - let msb_buckets = self.msb.deref(); - let msb_bytes = unsafe { - std::slice::from_raw_parts(msb_buckets.as_ptr() as *const u8, size_of_val(msb_buckets)) - }; - writer.write_all(msb_bytes)?; - - let mut bytes_written = msb_bytes.len(); - - bytes_written += self.lsb.write(writer)?; - - Ok(bytes_written) - } - - #[cfg(target_endian = "big")] - pub fn write(&self, writer: &mut W) -> std::io::Result { - unimplemented!("not supported on big endian platforms") - } -} From 0b63f37a5c0e6d78f1834373839f90815d96d5bb Mon Sep 17 00:00:00 2001 From: Sam Cutler Date: Thu, 24 Jul 2025 21:09:26 +0100 Subject: [PATCH 9/9] Remove whitespace in geofilters bitvec --- crates/geo_filters/src/diff_count/bitvec.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/crates/geo_filters/src/diff_count/bitvec.rs b/crates/geo_filters/src/diff_count/bitvec.rs index e96b65c..98c1498 100644 --- a/crates/geo_filters/src/diff_count/bitvec.rs +++ b/crates/geo_filters/src/diff_count/bitvec.rs @@ -152,7 +152,6 @@ impl BitVec<'_> { if buf.is_empty() { return Self::default(); } - // The first byte of the serialized BitVec is used to indicate how many // of the bits in the left-most u64 block are *unoccupied*. // See [`BitVec::write`] implementation for how this is done. @@ -161,17 +160,14 @@ impl BitVec<'_> { "Number of unoccupied bits should be <64, got {}", buf[0] ); - let num_bits = (buf.len() - 1) * 8 - buf[0] as usize; buf = &buf[1..]; - assert_eq!( buf.len() % BYTES_PER_BLOCK, 0, "buffer should be a multiple of 8 bytes, got {}", buf.len() ); - let blocks = unsafe { std::mem::transmute::<&[u8], &[u64]>(std::slice::from_raw_parts( buf.as_ptr(), @@ -179,7 +175,6 @@ impl BitVec<'_> { )) }; let blocks = Cow::Borrowed(blocks); - Self { num_bits, blocks } } @@ -188,18 +183,14 @@ impl BitVec<'_> { if self.is_empty() { return Ok(0); } - // First serialize the number of unoccupied bits in the last block as one byte. let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8; writer.write_all(&[unoccupied_bits])?; - let blocks = self.blocks.deref(); let block_bytes = unsafe { std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK) }; - writer.write_all(block_bytes)?; - Ok(block_bytes.len() + 1) } }