From 29b624c28ef7b18e3d14846fd79bed2727bfbf0b Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Tue, 21 Oct 2025 19:19:12 +0000 Subject: [PATCH 01/17] Fix Set bench --- Cargo.toml | 1 + benches/set.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8b7ef64..38157e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ proptest = "1.4.0" rand = "0.8.5" rand_chacha = "0.3.1" rkyv = { version = "0.7.42", features = ["validation", "strict"] } +rustc-hash = "2" test-case = "3.3.1" [features] diff --git a/benches/set.rs b/benches/set.rs index d4654b7..d949f21 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -1,14 +1,13 @@ +use std::collections::HashSet; use std::env; use std::hash::{BuildHasherDefault, DefaultHasher}; use std::time::Instant; -use std::{collections::HashSet, default}; use entropy_map::{Set, DEFAULT_GAMMA}; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; -use rkyv::collections; pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); @@ -21,7 +20,9 @@ pub fn benchmark(c: &mut Criterion) { println!("set generation took: {:?}", t0.elapsed()); let t0 = Instant::now(); - let set = Set::try_from(original_set.clone()).expect("failed to build set"); + let set = + Set::::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA) + .expect("failed to build set"); println!("set construction took: {:?}", t0.elapsed()); let mut group = c.benchmark_group("set"); @@ -45,7 +46,7 @@ pub fn benchmark(c: &mut Criterion) { }); }); - let fxhash_set: HashSet = HashSet::from_iter(original_set.iter().cloned()); + let fxhash_set: HashSet = HashSet::from_iter(original_set.iter().cloned()); group.bench_function("std-contains-fxhash", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { From 131b7cdb0e0bd721ba8117d0f6c457cbacbeb1da Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Tue, 21 Oct 2025 19:49:12 +0000 Subject: [PATCH 02/17] Update criterion, wyhash --- Cargo.toml | 4 ++-- benches/map_with_dict.rs | 3 ++- benches/map_with_dict_bitpacked.rs | 3 ++- benches/mphf.rs | 3 ++- benches/rank.rs | 3 ++- benches/set.rs | 3 ++- 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 38157e3..fc7b142 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,11 +19,11 @@ bitpacking = "0.9.2" bytecheck = { version = "~0.6.8", default-features = false, optional = true } num = "0.4.1" rkyv = { version = "0.7.42", features = ["validation", "strict"], optional = true } -wyhash = "0.5.0" +wyhash = "0.6" [dev-dependencies] bitvec = "1.0.1" -criterion = { version = "0.5.1", features = ["html_reports"] } +criterion = { version = "0.7", features = ["html_reports"] } paste = "1.0.14" proptest = "1.4.0" rand = "0.8.5" diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index c7e960b..516fb7e 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::env; +use std::hint::black_box; use std::time::Instant; use entropy_map::MapWithDict; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index 92121ec..02dc29d 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::env; +use std::hint::black_box; use std::time::Instant; use entropy_map::MapWithDictBitpacked; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; diff --git a/benches/mphf.rs b/benches/mphf.rs index 9cc3d00..e02da4f 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -1,9 +1,10 @@ use std::env; +use std::hint::black_box; use std::time::Instant; use entropy_map::Mphf; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::random; /// # Benchmark results for N = 1M: diff --git a/benches/rank.rs b/benches/rank.rs index c038ef8..e89e70a 100644 --- a/benches/rank.rs +++ b/benches/rank.rs @@ -1,9 +1,10 @@ use std::env; +use std::hint::black_box; use std::time::Instant; use entropy_map::{RankedBits, RankedBitsAccess}; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::prelude::SliceRandom; use rand::random; diff --git a/benches/set.rs b/benches/set.rs index d949f21..2bafc19 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -1,11 +1,12 @@ use std::collections::HashSet; use std::env; use std::hash::{BuildHasherDefault, DefaultHasher}; +use std::hint::black_box; use std::time::Instant; use entropy_map::{Set, DEFAULT_GAMMA}; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; From ba8f354ced47c2b1151bb960d65bf4b40a8ca824 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Tue, 21 Oct 2025 21:55:30 +0000 Subject: [PATCH 03/17] Update rkyv --- Cargo.toml | 4 +-- benches/map_with_dict.rs | 6 ++--- benches/map_with_dict_bitpacked.rs | 6 ++--- benches/mphf.rs | 6 ++--- examples/map_with_dict.rs | 6 +++-- examples/map_with_dict_bitpacked.rs | 6 +++-- examples/mphf.rs | 6 +++-- examples/set.rs | 6 +++-- src/map_with_dict.rs | 39 ++++++++++++++--------------- src/map_with_dict_bitpacked.rs | 13 +++++----- src/mphf.rs | 24 ++++++++++-------- src/rank.rs | 5 ++-- src/set.rs | 14 +++++------ 13 files changed, 74 insertions(+), 67 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fc7b142..9369261 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ categories = ["algorithms", "data-structures"] bitpacking = "0.9.2" bytecheck = { version = "~0.6.8", default-features = false, optional = true } num = "0.4.1" -rkyv = { version = "0.7.42", features = ["validation", "strict"], optional = true } +rkyv = { version = "0.8", optional = true } wyhash = "0.6" [dev-dependencies] @@ -28,7 +28,7 @@ paste = "1.0.14" proptest = "1.4.0" rand = "0.8.5" rand_chacha = "0.3.1" -rkyv = { version = "0.7.42", features = ["validation", "strict"] } +rkyv = { version = "0.8" } rustc-hash = "2" test-case = "3.3.1" diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index 516fb7e..5c84022 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -3,7 +3,7 @@ use std::env; use std::hint::black_box; use std::time::Instant; -use entropy_map::MapWithDict; +use entropy_map::{ArchivedMapWithDict, MapWithDict}; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; @@ -54,10 +54,10 @@ pub fn benchmark(c: &mut Criterion) { }); let t0 = Instant::now(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); println!("map_with_dict rkyv serialization took: {:?}", t0.elapsed()); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); group.bench_function("get-rkyv", |b| { b.iter(|| { diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index 02dc29d..867f71b 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -3,7 +3,7 @@ use std::env; use std::hint::black_box; use std::time::Instant; -use entropy_map::MapWithDictBitpacked; +use entropy_map::{ArchivedMapWithDictBitpacked, MapWithDictBitpacked}; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; @@ -55,10 +55,10 @@ pub fn benchmark(c: &mut Criterion) { }); let t0 = Instant::now(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); println!("map_with_dict_bitpacked rkyv serialization took: {:?}", t0.elapsed()); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); group.bench_function("get-rkyv", |b| { b.iter(|| { diff --git a/benches/mphf.rs b/benches/mphf.rs index e02da4f..1dc54eb 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -2,7 +2,7 @@ use std::env; use std::hint::black_box; use std::time::Instant; -use entropy_map::Mphf; +use entropy_map::{ArchivedMphf, Mphf}; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::random; @@ -61,10 +61,10 @@ pub fn benchmark(c: &mut Criterion) { }); let t0 = Instant::now(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&mphf).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&mphf).unwrap(); println!("mphf ({:.1}) rkyv serialization took: {:?}", gamma, t0.elapsed()); - let rkyv_mphf = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_mphf = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); group.bench_function(format!("rkyv-mphf-get/gamma-{:.1}", gamma), |b| { b.iter(|| { diff --git a/examples/map_with_dict.rs b/examples/map_with_dict.rs index 210058d..fd93783 100644 --- a/examples/map_with_dict.rs +++ b/examples/map_with_dict.rs @@ -20,9 +20,11 @@ fn main() { #[cfg(feature = "rkyv_derive")] { + use entropy_map::ArchivedMapWithDict; + // Serialize map to rkyv and test again - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert_eq!(rkyv_map.get(&1).unwrap(), &"Dog".to_string()); assert_eq!(rkyv_map.get(&2).unwrap(), &"Cat".to_string()); diff --git a/examples/map_with_dict_bitpacked.rs b/examples/map_with_dict_bitpacked.rs index 096ea30..40a548a 100644 --- a/examples/map_with_dict_bitpacked.rs +++ b/examples/map_with_dict_bitpacked.rs @@ -23,9 +23,11 @@ fn main() { #[cfg(feature = "rkyv_derive")] { + use entropy_map::ArchivedMapWithDictBitpacked; + // Serialize map to rkyv and test again - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert!(rkyv_map.get_values(&1, &mut values_buf)); assert_eq!(values_buf, vec![1, 2, 3]); diff --git a/examples/mphf.rs b/examples/mphf.rs index 5039813..d31f012 100644 --- a/examples/mphf.rs +++ b/examples/mphf.rs @@ -15,9 +15,11 @@ fn main() { #[cfg(feature = "rkyv_derive")] { + use entropy_map::ArchivedMphf; + // Serialize mphf to rkyv and test again - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&mphf).unwrap(); - let rkyv_mphf = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&mphf).unwrap(); + let rkyv_mphf = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert!(rkyv_mphf.get(&1).is_some()); assert!(rkyv_mphf.get(&5).is_some()); diff --git a/examples/set.rs b/examples/set.rs index 00ebc5d..6319a21 100644 --- a/examples/set.rs +++ b/examples/set.rs @@ -19,8 +19,10 @@ fn main() { #[cfg(feature = "rkyv_derive")] { - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&set).unwrap(); - let rkyv = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + use entropy_map::ArchivedSet; + + let rkyv_bytes = rkyv::to_bytes::(&set).unwrap(); + let rkyv = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert!(rkyv.contains(&1)); assert!(rkyv.contains(&2)); diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 66fdf83..a9b6ad2 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -20,7 +20,6 @@ use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; /// An efficient, immutable hash map with values dictionary-packed for optimized space usage. #[derive(Default)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct MapWithDict where ST: PrimInt + Unsigned, @@ -82,7 +81,7 @@ where } } - Ok(MapWithDict { + Ok(Self { mphf, keys: keys.into_boxed_slice(), values_index: values_index.into_boxed_slice(), @@ -262,7 +261,7 @@ where #[inline] fn try_from(value: HashMap) -> Result { - MapWithDict::::from_iter_with_params(value, DEFAULT_GAMMA) + Self::from_iter_with_params(value, DEFAULT_GAMMA) } } @@ -281,11 +280,11 @@ where /// # Examples /// ``` /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMapWithDict; /// # use entropy_map::MapWithDict; /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// let archived_map = rkyv::from_bytes::>( - /// &rkyv::to_bytes::<_, 1024>(&map).unwrap() - /// ).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); /// assert_eq!(archived_map.contains_key(&1), true); /// assert_eq!(archived_map.contains_key(&2), false); /// ``` @@ -310,13 +309,13 @@ where /// # Examples /// ``` /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMapWithDict; /// # use entropy_map::MapWithDict; /// let map = MapWithDict::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// let archived_map = rkyv::from_bytes::>( - /// &rkyv::to_bytes::<_, 1024>(&map).unwrap() - /// ).unwrap(); - /// assert_eq!(archived_map.get(&1), Some(&2)); - /// assert_eq!(archived_map.get(&5), None); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); + /// assert_eq!(archived_map.get(&1).map(|v| v.to_native()), Some(2)); + /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); /// ``` #[inline] pub fn get(&self, key: &Q) -> Option<&V::Archived> @@ -331,7 +330,7 @@ where unsafe { if self.keys.get_unchecked(idx) == key { // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) - let value_idx = *self.values_index.get_unchecked(idx) as usize; + let value_idx = self.values_index.get_unchecked(idx).to_native() as usize; Some(self.values_dict.get_unchecked(value_idx)) } else { None @@ -347,7 +346,7 @@ where .zip(self.values_index.iter()) .map(move |(key, &value_idx)| { // SAFETY: `value_idx` is always within bounds (ensured during construction) - let value = unsafe { self.values_dict.get_unchecked(value_idx as usize) }; + let value = unsafe { self.values_dict.get_unchecked(value_idx.to_native() as usize) }; (key, value) }) } @@ -433,11 +432,11 @@ mod tests { // create regular `HashMap`, then `MapWithDict`, then serialize to `rkyv` bytes. let original_map = gen_map(1000); let map = MapWithDict::try_from(original_map.clone()).unwrap(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - assert_eq!(rkyv_bytes.len(), 12464); + assert_eq!(rkyv_bytes.len(), 12480); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); // Test get on `Archived` version for (k, v) in original_map.iter() { @@ -445,8 +444,8 @@ mod tests { } // Test iter on `Archived` version - for (&k, &v) in rkyv_map.iter() { - assert_eq!(original_map.get(&k), Some(&v)); + for (k, v) in rkyv_map.iter() { + assert_eq!(original_map.get(&k.to_native()), Some(&v.to_native())); } } @@ -455,8 +454,8 @@ mod tests { fn test_rkyv_get_borrow() { let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = MapWithDict::try_from(original_map).unwrap(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert_eq!(map.get("a"), Some(&())); assert!(rkyv_map.contains_key("a")); diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index 203f176..93245ff 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -25,7 +25,6 @@ use crate::mphf::{Mphf, DEFAULT_GAMMA}; /// An efficient, immutable hash map with bit-packed `Vec` values for optimized space usage. #[derive(Default)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct MapWithDictBitpacked where ST: PrimInt + Unsigned, @@ -363,11 +362,11 @@ where /// # Examples /// ``` /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMapWithDictBitpacked; /// # use entropy_map::MapWithDictBitpacked; /// let map = MapWithDictBitpacked::try_from(HashMap::from([(1, vec![2]), (3, vec![4])])).unwrap(); - /// let archived_map = rkyv::from_bytes::>( - /// &rkyv::to_bytes::<_, 1024>(&map).unwrap() - /// ).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); /// let mut values = [0]; /// assert_eq!(archived_map.get_values(&1, &mut values), true); /// assert_eq!(values, [2]); @@ -387,7 +386,7 @@ where } // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) - let value_idx = *self.values_index.get_unchecked(idx) as usize; + let value_idx = self.values_index.get_unchecked(idx).to_native() as usize; let dict = self.values_dict.get_unchecked(value_idx..); unpack_values(dict, values); } @@ -544,11 +543,11 @@ mod tests { let values_num = 10; let original_map = gen_map(items_num, values_num); let map = MapWithDictBitpacked::try_from(original_map.clone()).unwrap(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); assert_eq!(rkyv_bytes.len(), 18516); - let rkyv_map = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); // Test get_values on `Archived` version of `MapWithDictBitpacked` let mut values_buf = vec![0; values_num]; diff --git a/src/mphf.rs b/src/mphf.rs index acb52ae..b5f2b9a 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -26,7 +26,6 @@ use crate::rank::{RankedBits, RankedBitsAccess}; /// - `H`: hasher used to hash keys, default `WyHash`. #[derive(Default)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct Mphf { /// Ranked bits for efficient rank queries ranked_bits: RankedBits, @@ -69,6 +68,11 @@ impl(keys: &[K], gamma: f32) -> Result { + Self::from_iter(keys.iter(), gamma) + } + + /// Initializes `Mphf` using iter of `keys` and parameter `gamma`. + pub fn from_iter<'k, K: Hash + 'k>(keys_iter: impl Iterator, gamma: f32) -> Result { if gamma < 1.0 { return Err(InvalidGammaParameter); } @@ -77,7 +81,7 @@ impl = keys.iter().map(|key| hash_key::(key)).collect(); + let mut hashes: Vec = keys_iter.map(|key| hash_key::(key)).collect(); let mut group_bits = vec![]; let mut group_seeds = vec![]; let mut level_groups = vec![]; @@ -95,7 +99,7 @@ impl(&self, key: &K) -> Option { - Self::get_impl(key, &self.level_groups, &self.group_seeds, &self.ranked_bits) + Self::get_impl(key, self.level_groups.iter().copied(), &self.group_seeds, &self.ranked_bits) } /// Inner implementation of `get` with `level_groups`, `group_seeds` and `ranked_bits` passed @@ -244,12 +248,12 @@ impl( key: &K, - level_groups: &[u32], + level_groups: impl Iterator, group_seeds: &[ST], ranked_bits: &impl RankedBitsAccess, ) -> Option { let mut groups_before = 0; - for (level, &groups) in level_groups.iter().enumerate() { + for (level, groups) in level_groups.enumerate() { let level_hash = hash_with_seed(hash_key::(key), level as u32); let group_idx = groups_before + fastmod32(level_hash as u32, groups); // SAFETY: `group_idx` is always within bounds (ensured during calculation) @@ -318,7 +322,7 @@ where { #[inline] pub fn get(&self, key: &K) -> Option { - Mphf::::get_impl(key, &self.level_groups, &self.group_seeds, &self.ranked_bits) + Mphf::::get_impl(key, self.level_groups.iter().map(|v| v.to_native()), &self.group_seeds, &self.ranked_bits) } } @@ -412,11 +416,11 @@ mod tests { let n = 10000; let keys = (0..n as u64).collect::>(); let mphf = Mphf::<32, 4>::from_slice(&keys, DEFAULT_GAMMA).expect("failed to create mphf"); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&mphf).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&mphf).unwrap(); - assert_eq!(rkyv_bytes.len(), 3804); + assert_eq!(rkyv_bytes.len(), 3884); - let rkyv_mphf = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_mphf = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); // Ensure that all keys are assigned unique index which is less than `n` let mut set = HashSet::with_capacity(n); diff --git a/src/rank.rs b/src/rank.rs index c1aad42..97d8e50 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -56,7 +56,6 @@ pub trait RankedBitsAccess { #[derive(Debug, Default)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct RankedBits { /// The bit vector represented as an array of u64 integers. bits: Box<[u64]>, @@ -70,7 +69,6 @@ pub struct RankedBits { /// See https://github.com/rkyv/rkyv/issues/409 for more details. #[derive(Debug)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct L12Rank([u8; 16]); /// Trait used to access archived and non-archived L1 and L2 ranks @@ -163,7 +161,8 @@ impl RankedBitsAccess for RankedBits { impl RankedBitsAccess for ArchivedRankedBits { #[inline] fn rank(&self, idx: usize) -> Option { - unsafe { Self::rank_impl(&self.bits, &self.l12_ranks, idx) } + // transmute? + unsafe { Self::rank_impl(std::mem::transmute(self.bits.get()), &self.l12_ranks, idx) } } } diff --git a/src/set.rs b/src/set.rs index 9b8b08c..7c3ab97 100644 --- a/src/set.rs +++ b/src/set.rs @@ -23,7 +23,6 @@ use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; /// An efficient, immutable set. #[derive(Default)] #[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -#[cfg_attr(feature = "rkyv_derive", archive_attr(derive(rkyv::CheckBytes)))] pub struct Set where ST: PrimInt + Unsigned, @@ -185,9 +184,8 @@ where /// # use std::collections::HashSet; /// # use entropy_map::{ArchivedSet, Set}; /// let set: Set = Set::try_from(HashSet::from([1, 2, 3])).unwrap(); - /// let archived_set = rkyv::from_bytes::>( - /// &rkyv::to_bytes::<_, 1024>(&set).unwrap() - /// ).unwrap(); + /// let bytes = rkyv::to_bytes::(&set).unwrap(); + /// let archived_set = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); /// assert_eq!(archived_set.contains(&1), true); /// assert_eq!(archived_set.contains(&4), false); /// ``` @@ -264,11 +262,11 @@ mod tests { // create regular `HashSet`, then `Set`, then serialize to `rkyv` bytes. let original_set = gen_set(1000); let set = Set::try_from(original_set.clone()).unwrap(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&set).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&set).unwrap(); assert_eq!(rkyv_bytes.len(), 8408); - let rkyv_set = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_set = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); // Test get on `Archived` version for k in original_set.iter() { @@ -280,8 +278,8 @@ mod tests { #[test] fn test_rkyv_contains_borrow() { let set = Set::try_from(HashSet::from(["a".to_string(), "b".to_string()])).unwrap(); - let rkyv_bytes = rkyv::to_bytes::<_, 1024>(&set).unwrap(); - let rkyv_set = rkyv::check_archived_root::>(&rkyv_bytes).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&set).unwrap(); + let rkyv_set = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); assert!(rkyv_set.contains("a")); assert!(rkyv_set.contains("b")); From b4ef83d81d95bc75ff21ef20501fc947924af1cf Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Tue, 21 Oct 2025 22:01:35 +0000 Subject: [PATCH 04/17] Fmt --- benches/map_with_dict.rs | 5 +---- benches/map_with_dict_bitpacked.rs | 5 +---- benches/mphf.rs | 4 +--- benches/rank.rs | 7 ++----- benches/set.rs | 12 +++++++----- rustfmt.toml | 3 ++- src/map_with_dict.rs | 10 ++++++---- src/map_with_dict_bitpacked.rs | 10 ++++++---- src/mphf.rs | 28 +++++++++++++++++++++------- src/rank.rs | 6 ++---- src/set.rs | 10 ++++++---- 11 files changed, 55 insertions(+), 45 deletions(-) diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index 5c84022..2277fdc 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -1,7 +1,4 @@ -use std::collections::HashMap; -use std::env; -use std::hint::black_box; -use std::time::Instant; +use std::{collections::HashMap, env, hint::black_box, time::Instant}; use entropy_map::{ArchivedMapWithDict, MapWithDict}; diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index 867f71b..b332438 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -1,7 +1,4 @@ -use std::collections::HashMap; -use std::env; -use std::hint::black_box; -use std::time::Instant; +use std::{collections::HashMap, env, hint::black_box, time::Instant}; use entropy_map::{ArchivedMapWithDictBitpacked, MapWithDictBitpacked}; diff --git a/benches/mphf.rs b/benches/mphf.rs index 1dc54eb..fc5388a 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -1,6 +1,4 @@ -use std::env; -use std::hint::black_box; -use std::time::Instant; +use std::{env, hint::black_box, time::Instant}; use entropy_map::{ArchivedMphf, Mphf}; diff --git a/benches/rank.rs b/benches/rank.rs index e89e70a..9a632c7 100644 --- a/benches/rank.rs +++ b/benches/rank.rs @@ -1,12 +1,9 @@ -use std::env; -use std::hint::black_box; -use std::time::Instant; +use std::{env, hint::black_box, time::Instant}; use entropy_map::{RankedBits, RankedBitsAccess}; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; -use rand::prelude::SliceRandom; -use rand::random; +use rand::{prelude::SliceRandom, random}; /// Benchmark results for N = 1M: /// diff --git a/benches/set.rs b/benches/set.rs index 2bafc19..ee5fc36 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -1,8 +1,10 @@ -use std::collections::HashSet; -use std::env; -use std::hash::{BuildHasherDefault, DefaultHasher}; -use std::hint::black_box; -use std::time::Instant; +use std::{ + collections::HashSet, + env, + hash::{BuildHasherDefault, DefaultHasher}, + hint::black_box, + time::Instant, +}; use entropy_map::{Set, DEFAULT_GAMMA}; diff --git a/rustfmt.toml b/rustfmt.toml index 9998926..ddfd589 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,2 +1,3 @@ max_width = 120 -struct_lit_width = 80 \ No newline at end of file +struct_lit_width = 80 +imports_granularity = "Crate" diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index a9b6ad2..40117f6 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -7,10 +7,12 @@ //! the values dictionary. Keys are stored to ensure that `get` operation will return `None` if key //! wasn't present in original set. -use std::borrow::Borrow; -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; -use std::mem::size_of_val; +use std::{ + borrow::Borrow, + collections::HashMap, + hash::{Hash, Hasher}, + mem::size_of_val, +}; use num::{PrimInt, Unsigned}; use wyhash::WyHash; diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index 93245ff..c549e61 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -11,10 +11,12 @@ //! stored in the byte dictionary. Keys are maintained for validation during retrieval. A `get` //! query for a non-existent key at construction returns `false`, similar to `MapWithDict`. -use std::borrow::Borrow; -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; -use std::mem::size_of_val; +use std::{ + borrow::Borrow, + collections::HashMap, + hash::{Hash, Hasher}, + mem::size_of_val, +}; use bitpacking::{BitPacker, BitPacker1x}; use num::{PrimInt, Unsigned}; diff --git a/src/mphf.rs b/src/mphf.rs index b5f2b9a..cd04b38 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -7,15 +7,19 @@ //! but prioritizes code simplicity and portability, with a special focus on optimizing the rank //! storage mechanism and reducing the construction time and querying latency of MPHF. -use std::hash::{Hash, Hasher}; -use std::marker::PhantomData; -use std::mem::size_of_val; +use std::{ + hash::{Hash, Hasher}, + marker::PhantomData, + mem::size_of_val, +}; use num::{Integer, PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::MphfError::*; -use crate::rank::{RankedBits, RankedBitsAccess}; +use crate::{ + mphf::MphfError::*, + rank::{RankedBits, RankedBitsAccess}, +}; /// A Minimal Perfect Hash Function (MPHF). /// @@ -240,7 +244,12 @@ impl(&self, key: &K) -> Option { - Self::get_impl(key, self.level_groups.iter().copied(), &self.group_seeds, &self.ranked_bits) + Self::get_impl( + key, + self.level_groups.iter().copied(), + &self.group_seeds, + &self.ranked_bits, + ) } /// Inner implementation of `get` with `level_groups`, `group_seeds` and `ranked_bits` passed @@ -322,7 +331,12 @@ where { #[inline] pub fn get(&self, key: &K) -> Option { - Mphf::::get_impl(key, self.level_groups.iter().map(|v| v.to_native()), &self.group_seeds, &self.ranked_bits) + Mphf::::get_impl( + key, + self.level_groups.iter().map(|v| v.to_native()), + &self.group_seeds, + &self.ranked_bits, + ) } } diff --git a/src/rank.rs b/src/rank.rs index 97d8e50..2502f4f 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -169,10 +169,8 @@ impl RankedBitsAccess for ArchivedRankedBits { #[cfg(test)] mod tests { use super::*; - use bitvec::order::Lsb0; - use bitvec::vec::BitVec; - use rand::distributions::Standard; - use rand::Rng; + use bitvec::{order::Lsb0, vec::BitVec}; + use rand::{distributions::Standard, Rng}; #[test] fn test_rank_and_get() { diff --git a/src/set.rs b/src/set.rs index 7c3ab97..bb43ed8 100644 --- a/src/set.rs +++ b/src/set.rs @@ -10,10 +10,12 @@ //! dynamically update membership. However, when the `rkyv_derive` feature is enabled, you can use //! [`rkyv`](https://rkyv.org/) to perform zero-copy deserialization of a new set. -use std::borrow::Borrow; -use std::collections::HashSet; -use std::hash::{Hash, Hasher}; -use std::mem::size_of_val; +use std::{ + borrow::Borrow, + collections::HashSet, + hash::{Hash, Hasher}, + mem::size_of_val, +}; use num::{PrimInt, Unsigned}; use wyhash::WyHash; From 325e1e2ae92cebac3f9b8044392e93eef5fc7d7a Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Wed, 22 Oct 2025 00:24:53 +0000 Subject: [PATCH 05/17] Fix bench black_box, new stuct Map --- Cargo.toml | 23 +- benches/map.rs | 62 ++++ benches/map_with_dict.rs | 9 +- benches/map_with_dict_bitpacked.rs | 6 +- benches/mphf.rs | 6 +- benches/rank.rs | 4 +- benches/set.rs | 23 +- src/lib.rs | 2 + src/map.rs | 498 +++++++++++++++++++++++++++++ src/map_tuple.rs | 492 ++++++++++++++++++++++++++++ src/map_with_dict.rs | 8 +- src/map_with_dict_bitpacked.rs | 11 +- src/rank.rs | 2 +- src/set.rs | 2 +- 14 files changed, 1107 insertions(+), 41 deletions(-) create mode 100644 benches/map.rs create mode 100644 src/map.rs create mode 100644 src/map_tuple.rs diff --git a/Cargo.toml b/Cargo.toml index 9369261..32cee79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,6 @@ categories = ["algorithms", "data-structures"] [dependencies] bitpacking = "0.9.2" -bytecheck = { version = "~0.6.8", default-features = false, optional = true } num = "0.4.1" rkyv = { version = "0.8", optional = true } wyhash = "0.6" @@ -28,13 +27,13 @@ paste = "1.0.14" proptest = "1.4.0" rand = "0.8.5" rand_chacha = "0.3.1" -rkyv = { version = "0.8" } +rkyv = "0.8" rustc-hash = "2" test-case = "3.3.1" [features] default = [] -rkyv_derive = ["rkyv", "bytecheck"] +rkyv_derive = ["rkyv"] [[bench]] name = "rank" @@ -45,6 +44,11 @@ name = "mphf" harness = false required-features = ["rkyv_derive"] +[[bench]] +name = "map" +harness = false +required-features = ["rkyv_derive"] + [[bench]] name = "map_with_dict" harness = false @@ -60,11 +64,8 @@ name = "set" harness = false required-features = ["rkyv_derive"] -[profile.bench] -debug = true - -[profile.release] -codegen-units = 1 -debug = true -lto = "fat" -opt-level = 3 +# [profile.release] +# codegen-units = 1 +# debug = true +# lto = "fat" +# opt-level = 3 diff --git a/benches/map.rs b/benches/map.rs new file mode 100644 index 0000000..7dc7ad4 --- /dev/null +++ b/benches/map.rs @@ -0,0 +1,62 @@ +use std::{collections::HashMap, env, hint::black_box, time::Instant}; + +use entropy_map::{ArchivedMap, Map}; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +pub fn benchmark(c: &mut Criterion) { + let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); + let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); + + let mut rng = ChaCha8Rng::seed_from_u64(123); + + let t0 = Instant::now(); + let original_map: HashMap = (0..n) + .map(|_| { + let key = rng.gen::(); + let value = rng.gen::(); + (key, value) + }) + .collect(); + println!("map generation took: {:?}", t0.elapsed()); + + let t0 = Instant::now(); + let map = Map::try_from(original_map.clone()).expect("failed to build map"); + println!("map construction took: {:?}", t0.elapsed()); + + let mut group = c.benchmark_group("Map"); + group.throughput(Throughput::Elements(query_n as u64)); + + group.bench_function("get", |b| { + b.iter(|| { + for key in original_map.keys().take(query_n) { + black_box(map.get(key).unwrap()); + } + }); + }); + + let t0 = Instant::now(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + println!("map rkyv serialization took: {:?}", t0.elapsed()); + + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + + group.bench_function("get-rkyv", |b| { + b.iter(|| { + for key in original_map.keys().take(query_n) { + black_box(rkyv_map.get(key).unwrap()); + } + }); + }); + + group.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default(); + targets = benchmark, +} +criterion_main!(benches); diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index 2277fdc..b7ec4c7 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -29,7 +29,8 @@ pub fn benchmark(c: &mut Criterion) { let original_map: HashMap = (0..n) .map(|_| { let key = rng.gen::(); - let value = rng.gen_range(1..=10); + // let value = rng.gen_range(1..=10); + let value = rng.gen::(); (key, value) }) .collect(); @@ -39,13 +40,13 @@ pub fn benchmark(c: &mut Criterion) { let map = MapWithDict::try_from(original_map.clone()).expect("failed to build map"); println!("map_with_dict construction took: {:?}", t0.elapsed()); - let mut group = c.benchmark_group("map_with_dict"); + let mut group = c.benchmark_group("MapWithDict"); group.throughput(Throughput::Elements(query_n as u64)); group.bench_function("get", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { - map.get(black_box(key)).unwrap(); + black_box(map.get(key).unwrap()); } }); }); @@ -59,7 +60,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function("get-rkyv", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { - rkyv_map.get(black_box(key)).unwrap(); + black_box(rkyv_map.get(key).unwrap()); } }); }); diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index b332438..e698c94 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -40,13 +40,13 @@ pub fn benchmark(c: &mut Criterion) { let map = MapWithDictBitpacked::try_from(original_map.clone()).expect("failed to build map"); println!("map_with_dict_bitpacked construction took: {:?}", t0.elapsed()); - let mut group = c.benchmark_group("map_with_dict_bitpacked"); + let mut group = c.benchmark_group("MapWithDictBitpacked"); group.throughput(Throughput::Elements(query_n as u64)); group.bench_function("get_values", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { - map.get_values(black_box(key), &mut values_buf); + black_box(map.get_values(key, &mut values_buf)); } }); }); @@ -60,7 +60,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function("get-rkyv", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { - rkyv_map.get_values(black_box(key), &mut values_buf); + black_box(rkyv_map.get_values(key, &mut values_buf)); } }); }); diff --git a/benches/mphf.rs b/benches/mphf.rs index fc5388a..b30d000 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -32,7 +32,7 @@ pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); - let mut group = c.benchmark_group("mphf"); + let mut group = c.benchmark_group("Mphf"); group.throughput(Throughput::Elements(query_n as u64)); let t0 = Instant::now(); @@ -53,7 +53,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function(format!("mphf-get/gamma-{:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { - mphf.get(black_box(item)).unwrap(); + black_box(mphf.get(item).unwrap()); } }); }); @@ -67,7 +67,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function(format!("rkyv-mphf-get/gamma-{:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { - rkyv_mphf.get(black_box(item)).unwrap(); + black_box(rkyv_mphf.get(item).unwrap()); } }); }); diff --git a/benches/rank.rs b/benches/rank.rs index 9a632c7..4243b72 100644 --- a/benches/rank.rs +++ b/benches/rank.rs @@ -33,12 +33,12 @@ pub fn benchmark(c: &mut Criterion) { overhead ); - let mut group = c.benchmark_group("ranked_bits"); + let mut group = c.benchmark_group("RankedBits"); group.throughput(Throughput::Elements(query_n as u64)); group.bench_function("rank", |b| { b.iter(|| { for &idx in indices.iter().take(query_n) { - ranked_bits.rank(black_box(idx)).unwrap_or_default(); + black_box(ranked_bits.rank(idx).unwrap_or_default()); } }); }); diff --git a/benches/set.rs b/benches/set.rs index ee5fc36..5e2a949 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -24,17 +24,26 @@ pub fn benchmark(c: &mut Criterion) { let t0 = Instant::now(); let set = - Set::::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA) - .expect("failed to build set"); + Set::::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA).expect("failed to build set"); println!("set construction took: {:?}", t0.elapsed()); - let mut group = c.benchmark_group("set"); + let mut group = c.benchmark_group("Set"); group.throughput(Throughput::Elements(query_n as u64)); + group.bench_function("entropy-contains-native", |b| { + b.iter(|| { + for key in original_set.iter().take(query_n) { + black_box(set.contains(key)); + } + }); + }); + + let set_fxhash: Set = + Set::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA).expect("failed to build set"); group.bench_function("entropy-contains-fxhash", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { - set.contains(black_box(key)); + black_box(set_fxhash.contains(key)); } }); }); @@ -44,7 +53,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function("entropy-contains-defaulthasher", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { - set_default_hasher.contains(black_box(key)); + black_box(set_default_hasher.contains(key)); } }); }); @@ -53,7 +62,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function("std-contains-fxhash", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { - fxhash_set.contains(black_box(key)); + black_box(fxhash_set.contains(key)); } }); }); @@ -63,7 +72,7 @@ pub fn benchmark(c: &mut Criterion) { group.bench_function("std-contains-defaulthasher", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { - defaulthasher_set.contains(black_box(key)); + black_box(defaulthasher_set.contains(key)); } }); }); diff --git a/src/lib.rs b/src/lib.rs index 76ab64c..babd652 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,11 @@ +pub mod map; pub mod map_with_dict; pub mod map_with_dict_bitpacked; pub mod mphf; pub mod rank; pub mod set; +pub use map::*; pub use map_with_dict::*; pub use map_with_dict_bitpacked::*; pub use mphf::*; diff --git a/src/map.rs b/src/map.rs new file mode 100644 index 0000000..13e460a --- /dev/null +++ b/src/map.rs @@ -0,0 +1,498 @@ +//! A module providing `Map`, an immutable hash map implementation. +//! +//! `Map` is a hash map structure that optimizes for space by utilizing a minimal perfect +//! hash function (MPHF) for indexing the map's keys. +//! The MPHF provides direct access to the indices of keys. +//! Keys are stored to ensure that `get` operation will return `None` if key +//! wasn't present in the original set. + +use std::{ + borrow::Borrow, + collections::HashMap, + hash::{Hash, Hasher}, + mem::size_of_val, +}; + +use num::{PrimInt, Unsigned}; +use wyhash::WyHash; + +use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; + +/// An efficient, immutable hash map. +#[derive(Default)] +#[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] +pub struct Map +where + ST: PrimInt + Unsigned, + H: Hasher + Default, +{ + /// Minimally Perfect Hash Function for keys indices retrieval + mphf: Mphf, + /// Map keys + keys: Box<[K]>, + /// Map values + values: Box<[V]>, +} + +impl Map +where + K: Hash, + ST: PrimInt + Unsigned, + H: Hasher + Default, +{ + /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. + pub fn from_iter_with_params(iter: I, gamma: f32) -> Result + where + I: IntoIterator, + { + let mut keys = vec![]; + let mut values = vec![]; + + for (k, v) in iter { + keys.push(k); + values.push(v); + } + + let mphf = Mphf::from_slice(&keys, gamma)?; + + // Re-order `keys` and `values_index` according to `mphf` + for i in 0..keys.len() { + loop { + let idx = mphf.get(&keys[i]).unwrap(); + if idx == i { + break; + } + keys.swap(i, idx); + values.swap(i, idx); + } + } + + Ok(Self { mphf, keys: keys.into_boxed_slice(), values: values.into_boxed_slice() }) + } + + /// Returns a reference to the value corresponding to the key. Returns `None` if the key is + /// not present in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.get(&1), Some(&2)); + /// assert_eq!(map.get(&5), None); + /// ``` + #[inline] + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + let idx = self.mphf.get(key)?; + + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { + if self.keys.get_unchecked(idx) == key { + Some(self.values.get_unchecked(idx)) + } else { + None + } + } + } + + /// Returns the number of key-value pairs in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.len(), 2); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.keys.len() + } + + /// Returns `true` if the map contains no elements. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(0, 0); 0])).unwrap(); + /// assert_eq!(map.is_empty(), true); + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.is_empty(), false); + /// ``` + #[inline] + pub fn is_empty(&self) -> bool { + self.keys.is_empty() + } + + /// Checks if the map contains the specified key. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.contains_key(&1), true); + /// assert_eq!(map.contains_key(&2), false); + /// ``` + #[inline] + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + if let Some(idx) = self.mphf.get(key) { + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { self.keys.get_unchecked(idx) == key } + } else { + false + } + } + + /// Returns an iterator over the map, yielding key-value pairs. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for (key, val) in map.iter() { + /// println!("key: {key} val: {val}"); + /// } + /// ``` + #[inline] + pub fn iter(&self) -> impl Iterator { + self.keys.iter().zip(self.values.iter()) + } + + /// Returns an iterator over the keys of the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for key in map.keys() { + /// println!("{key}"); + /// } + /// ``` + #[inline] + pub fn keys(&self) -> impl Iterator { + self.keys.iter() + } + + /// Returns an iterator over the values of the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for val in map.values() { + /// println!("{val}"); + /// } + /// ``` + #[inline] + pub fn values(&self) -> impl Iterator { + self.values.iter() + } + + /// Returns the total number of bytes occupied by the structure. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.size(), 238); + /// ``` + #[inline] + pub fn size(&self) -> usize { + size_of_val(self) + self.mphf.size() + size_of_val(self.keys.as_ref()) + size_of_val(self.values.as_ref()) + } +} + +/// Creates a `Map` from a `HashMap`. +impl TryFrom> for Map +where + K: Eq + Hash + Clone, + V: Eq + Clone + Hash, +{ + type Error = MphfError; + + #[inline] + fn try_from(value: HashMap) -> Result { + Self::from_iter_with_params(value, DEFAULT_GAMMA) + } +} + +/// Implement `get` for `Archived` version of `Map` if feature is enabled +#[cfg(feature = "rkyv_derive")] +impl ArchivedMap +where + K: PartialEq + Hash + rkyv::Archive, + K::Archived: PartialEq, + V: rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + H: Hasher + Default, +{ + /// Checks if the map contains the specified key. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); + /// assert_eq!(archived_map.contains_key(&1), true); + /// assert_eq!(archived_map.contains_key(&2), false); + /// ``` + #[inline] + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow, + ::Archived: PartialEq, + Q: Hash + Eq, + { + if let Some(idx) = self.mphf.get(key) { + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { self.keys.get_unchecked(idx) == key } + } else { + false + } + } + + /// Returns a reference to the value corresponding to the key. Returns `None` if the key is + /// not present in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMap; + /// # use entropy_map::Map; + /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); + /// assert_eq!(archived_map.get(&1).map(|v| v.to_native()), Some(2)); + /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); + /// ``` + #[inline] + pub fn get(&self, key: &Q) -> Option<&V::Archived> + where + K: Borrow, + ::Archived: PartialEq, + Q: Hash + Eq, + { + let idx = self.mphf.get(key)?; + + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { + if self.keys.get_unchecked(idx) == key { + Some(self.values.get_unchecked(idx)) + } else { + None + } + } + } + + /// Returns an iterator over the archived map, yielding archived key-value pairs. + #[inline] + pub fn iter(&self) -> impl Iterator { + self.keys.iter().zip(self.values.iter()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use paste::paste; + use proptest::prelude::*; + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + use std::collections::{hash_map::RandomState, HashSet}; + + fn gen_map(items_num: usize) -> HashMap { + let mut rng = ChaCha8Rng::seed_from_u64(123); + + (0..items_num) + .map(|_| { + let key = rng.gen::(); + let value = rng.gen_range(1..=10); + (key, value) + }) + .collect() + } + + #[test] + fn test_map_with_dict() { + // Collect original key-value pairs directly into a HashMap + let original_map = gen_map(1000); + + // Create the map from the iterator + let map = Map::try_from(original_map.clone()).unwrap(); + + // Test len + assert_eq!(map.len(), original_map.len()); + + // Test is_empty + assert_eq!(map.is_empty(), original_map.is_empty()); + + // Test get, contains_key + for (key, value) in &original_map { + assert_eq!(map.get(key), Some(value)); + assert!(map.contains_key(key)); + } + + // Test iter + for (&k, &v) in map.iter() { + assert_eq!(original_map.get(&k), Some(&v)); + } + + // Test keys + for k in map.keys() { + assert!(original_map.contains_key(k)); + } + + // Test values + for &v in map.values() { + assert!(original_map.values().any(|&val| val == v)); + } + + // Test size + assert_eq!(map.size(), 12570); + } + + /// Assert that we can call `.get()` with `K::borrow()`. + #[test] + fn test_get_borrow() { + let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let map = Map::try_from(original_map).unwrap(); + + assert_eq!(map.get("a"), Some(&())); + assert!(map.contains_key("a")); + assert_eq!(map.get("b"), Some(&())); + assert!(map.contains_key("b")); + assert_eq!(map.get("c"), None); + assert!(!map.contains_key("c")); + } + + #[cfg(feature = "rkyv_derive")] + #[test] + fn test_rkyv() { + // create regular `HashMap`, then `Map`, then serialize to `rkyv` bytes. + let original_map = gen_map(1000); + let map = Map::try_from(original_map.clone()).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + + assert_eq!(rkyv_bytes.len(), 12432); + + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + + // Test get on `Archived` version + for (k, v) in original_map.iter() { + assert_eq!(v, rkyv_map.get(k).unwrap()); + } + + // Test iter on `Archived` version + for (k, v) in rkyv_map.iter() { + assert_eq!(original_map.get(&k.to_native()), Some(&v.to_native())); + } + } + + #[cfg(feature = "rkyv_derive")] + #[test] + fn test_rkyv_get_borrow() { + let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let map = Map::try_from(original_map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + + assert_eq!(map.get("a"), Some(&())); + assert!(rkyv_map.contains_key("a")); + assert_eq!(map.get("b"), Some(&())); + assert!(rkyv_map.contains_key("b")); + assert_eq!(map.get("c"), None); + assert!(!rkyv_map.contains_key("c")); + } + + macro_rules! proptest_map_with_dict_model { + ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => { + $( + paste! { + proptest! { + #[test] + fn [](model: HashMap, arbitrary: HashSet) { + let entropy_map: Map = Map::from_iter_with_params( + model.clone(), + $gamma as f32 / 100.0 + ).unwrap(); + + // Assert that length matches model. + assert_eq!(entropy_map.len(), model.len()); + assert_eq!(entropy_map.is_empty(), model.is_empty()); + + // Assert that keys and values match model. + assert_eq!( + HashSet::<_, RandomState>::from_iter(entropy_map.keys()), + HashSet::from_iter(model.keys()) + ); + assert_eq!( + HashSet::<_, RandomState>::from_iter(entropy_map.values()), + HashSet::from_iter(model.values()) + ); + + // Assert that contains and get operations match model for contained elements. + for (k, v) in &model { + assert!(entropy_map.contains_key(&k)); + assert_eq!(entropy_map.get(&k), Some(v)); + } + + // Assert that contains and get operations match model for random elements. + for k in arbitrary { + assert_eq!( + model.contains_key(&k), + entropy_map.contains_key(&k), + ); + assert_eq!(entropy_map.get(&k), model.get(&k)); + } + } + } + } + )* + }; + } + + proptest_map_with_dict_model!( + // (1, 8, 100), + (2, 8, 100), + (4, 8, 100), + (7, 8, 100), + (8, 8, 100), + (15, 8, 100), + (16, 8, 100), + (23, 8, 100), + (24, 8, 100), + (31, 8, 100), + (32, 8, 100), + (33, 8, 100), + (48, 8, 100), + (53, 8, 100), + (61, 8, 100), + (63, 8, 100), + (64, 8, 100), + (32, 7, 100), + (32, 5, 100), + (32, 4, 100), + (32, 3, 100), + (32, 1, 100), + (32, 0, 100), + (32, 8, 200), + (32, 6, 200), + ); +} diff --git a/src/map_tuple.rs b/src/map_tuple.rs new file mode 100644 index 0000000..d123d52 --- /dev/null +++ b/src/map_tuple.rs @@ -0,0 +1,492 @@ +//! A module providing `Map`, an immutable hash map implementation. +//! +//! `Map` is a hash map structure that optimizes for space by utilizing a minimal perfect +//! hash function (MPHF) for indexing the map's keys. +//! The MPHF provides direct access to the indices of keys. +//! Keys are stored to ensure that `get` operation will return `None` if key +//! wasn't present in the original set. + +use std::{ + borrow::Borrow, + collections::HashMap, + hash::{Hash, Hasher}, + mem::size_of_val, +}; + +use num::{PrimInt, Unsigned}; +use wyhash::WyHash; + +use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; + +/// An efficient, immutable hash map. +#[derive(Default)] +#[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] +pub struct MapTuple +where + ST: PrimInt + Unsigned, + H: Hasher + Default, +{ + /// Minimally Perfect Hash Function for keys indices retrieval + mphf: Mphf, + keys_vals: Box<[(K, V)]>, +} + +impl MapTuple +where + K: Hash, + ST: PrimInt + Unsigned, + H: Hasher + Default, +{ + /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. + pub fn from_iter_with_params(iter: I, gamma: f32) -> Result + where + I: IntoIterator, + { + let mut keys_vals: Vec<_> = iter.into_iter().collect(); + + let mphf = Mphf::from_iter(keys_vals.iter().map(|(k, _v)| k), gamma)?; + + // Re-order `keys` and `values_index` according to `mphf` + for i in 0..keys_vals.len() { + loop { + let idx = mphf.get(&keys_vals[i].0).unwrap(); + if idx == i { + break; + } + keys_vals.swap(i, idx); + } + } + + Ok(Self { mphf, keys_vals: keys_vals.into_boxed_slice() }) + } + + /// Returns a reference to the value corresponding to the key. Returns `None` if the key is + /// not present in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.get(&1), Some(&2)); + /// assert_eq!(map.get(&5), None); + /// ``` + #[inline] + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + let idx = self.mphf.get(key)?; + + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { + let (k, v) = self.keys_vals.get_unchecked(idx); + if k == key { + Some(v) + } else { + None + } + } + } + + /// Returns the number of key-value pairs in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.len(), 2); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.keys_vals.len() + } + + /// Returns `true` if the map contains no elements. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(0, 0); 0])).unwrap(); + /// assert_eq!(map.is_empty(), true); + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.is_empty(), false); + /// ``` + #[inline] + pub fn is_empty(&self) -> bool { + self.keys_vals.is_empty() + } + + /// Checks if the map contains the specified key. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.contains_key(&1), true); + /// assert_eq!(map.contains_key(&2), false); + /// ``` + #[inline] + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + if let Some(idx) = self.mphf.get(key) { + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { &self.keys_vals.get_unchecked(idx).0 == key } + } else { + false + } + } + + /// Returns an iterator over the map, yielding key-value pairs. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for (key, val) in map.iter() { + /// println!("key: {key} val: {val}"); + /// } + /// ``` + #[inline] + pub fn iter(&self) -> impl Iterator { + self.keys_vals.iter() + } + + /// Returns an iterator over the keys of the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for key in map.keys() { + /// println!("{key}"); + /// } + /// ``` + #[inline] + pub fn keys(&self) -> impl Iterator { + self.keys_vals.iter().map(|(k, _v)| k) + } + + /// Returns an iterator over the values of the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// for val in map.values() { + /// println!("{val}"); + /// } + /// ``` + #[inline] + pub fn values(&self) -> impl Iterator { + self.keys_vals.iter().map(|(_k, v)| v) + } + + /// Returns the total number of bytes occupied by the structure. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// assert_eq!(map.size(), 222); + /// ``` + #[inline] + pub fn size(&self) -> usize { + size_of_val(self) + self.mphf.size() + size_of_val(self.keys_vals.as_ref()) + } +} + +/// Creates a `Map` from a `HashMap`. +impl TryFrom> for MapTuple +where + K: Eq + Hash + Clone, + V: Eq + Clone + Hash, +{ + type Error = MphfError; + + #[inline] + fn try_from(value: HashMap) -> Result { + Self::from_iter_with_params(value, DEFAULT_GAMMA) + } +} + +/// Implement `get` for `Archived` version of `Map` if feature is enabled +#[cfg(feature = "rkyv_derive")] +impl ArchivedMapTuple +where + K: PartialEq + Hash + rkyv::Archive, + K::Archived: PartialEq, + V: rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + H: Hasher + Default, +{ + /// Checks if the map contains the specified key. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMapTuple; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); + /// assert_eq!(archived_map.contains_key(&1), true); + /// assert_eq!(archived_map.contains_key(&2), false); + /// ``` + #[inline] + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow, + ::Archived: PartialEq, + Q: Hash + Eq, + { + if let Some(idx) = self.mphf.get(key) { + // SAFETY: `idx` is always within bounds (ensured during construction) + let rkyv::tuple::ArchivedTuple2(k, _v) = unsafe { self.keys_vals.get_unchecked(idx) }; + + k == key + } else { + false + } + } + + /// Returns a reference to the value corresponding to the key. Returns `None` if the key is + /// not present in the map. + /// + /// # Examples + /// ``` + /// # use std::collections::HashMap; + /// # use entropy_map::ArchivedMapTuple; + /// # use entropy_map::Map; + /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); + /// let bytes = rkyv::to_bytes::(&map).unwrap(); + /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); + /// assert_eq!(archived_map.get(&1).map(|v| v.to_native()), Some(2)); + /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); + /// ``` + #[inline] + pub fn get(&self, key: &Q) -> Option<&V::Archived> + where + K: Borrow, + ::Archived: PartialEq, + Q: Hash + Eq, + { + let idx = self.mphf.get(key)?; + + // SAFETY: `idx` is always within bounds (ensured during construction) + unsafe { + let rkyv::tuple::ArchivedTuple2(k, v) = self.keys_vals.get_unchecked(idx); + if k == key { + Some(v) + } else { + None + } + } + } + + /// Returns an iterator over the archived map, yielding archived key-value pairs. + #[inline] + pub fn iter(&self) -> impl Iterator::Archived> { + self.keys_vals.iter() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use paste::paste; + use proptest::prelude::*; + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + use std::collections::{hash_map::RandomState, HashSet}; + + fn gen_map(items_num: usize) -> HashMap { + let mut rng = ChaCha8Rng::seed_from_u64(123); + + (0..items_num) + .map(|_| { + let key = rng.gen::(); + let value = rng.gen_range(1..=10); + (key, value) + }) + .collect() + } + + #[test] + fn test_map_with_dict() { + // Collect original key-value pairs directly into a HashMap + let original_map = gen_map(1000); + + // Create the map from the iterator + let map = MapTuple::try_from(original_map.clone()).unwrap(); + + // Test len + assert_eq!(map.len(), original_map.len()); + + // Test is_empty + assert_eq!(map.is_empty(), original_map.is_empty()); + + // Test get, contains_key + for (key, value) in &original_map { + assert_eq!(map.get(key), Some(value)); + assert!(map.contains_key(key)); + } + + // Test iter + for (k, v) in map.iter() { + assert_eq!(original_map.get(k), Some(v)); + } + + // Test keys + for k in map.keys() { + assert!(original_map.contains_key(k)); + } + + // Test values + for &v in map.values() { + assert!(original_map.values().any(|&val| val == v)); + } + + // Test size + assert_eq!(map.size(), 16554); + } + + /// Assert that we can call `.get()` with `K::borrow()`. + #[test] + fn test_get_borrow() { + let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let map = MapTuple::try_from(original_map).unwrap(); + + assert_eq!(map.get("a"), Some(&())); + assert!(map.contains_key("a")); + assert_eq!(map.get("b"), Some(&())); + assert!(map.contains_key("b")); + assert_eq!(map.get("c"), None); + assert!(!map.contains_key("c")); + } + + #[cfg(feature = "rkyv_derive")] + #[test] + fn test_rkyv() { + // create regular `HashMap`, then `Map`, then serialize to `rkyv` bytes. + let original_map = gen_map(1000); + let map = MapTuple::try_from(original_map.clone()).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + + assert_eq!(rkyv_bytes.len(), 16424); + + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + + // Test get on `Archived` version + for (k, v) in original_map.iter() { + assert_eq!(v, rkyv_map.get(k).unwrap()); + } + + // Test iter on `Archived` version + for rkyv::tuple::ArchivedTuple2(k, v) in rkyv_map.iter() { + assert_eq!(original_map.get(&k.to_native()), Some(&v.to_native())); + } + } + + #[cfg(feature = "rkyv_derive")] + #[test] + fn test_rkyv_get_borrow() { + let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let map = MapTuple::try_from(original_map).unwrap(); + let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + + assert_eq!(map.get("a"), Some(&())); + assert!(rkyv_map.contains_key("a")); + assert_eq!(map.get("b"), Some(&())); + assert!(rkyv_map.contains_key("b")); + assert_eq!(map.get("c"), None); + assert!(!rkyv_map.contains_key("c")); + } + + macro_rules! proptest_map_with_dict_model { + ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => { + $( + paste! { + proptest! { + #[test] + fn [](model: HashMap, arbitrary: HashSet) { + let entropy_map: MapTuple = MapTuple::from_iter_with_params( + model.clone(), + $gamma as f32 / 100.0 + ).unwrap(); + + // Assert that length matches model. + assert_eq!(entropy_map.len(), model.len()); + assert_eq!(entropy_map.is_empty(), model.is_empty()); + + // Assert that keys and values match model. + assert_eq!( + HashSet::<_, RandomState>::from_iter(entropy_map.keys()), + HashSet::from_iter(model.keys()) + ); + assert_eq!( + HashSet::<_, RandomState>::from_iter(entropy_map.values()), + HashSet::from_iter(model.values()) + ); + + // Assert that contains and get operations match model for contained elements. + for (k, v) in &model { + assert!(entropy_map.contains_key(&k)); + assert_eq!(entropy_map.get(&k), Some(v)); + } + + // Assert that contains and get operations match model for random elements. + for k in arbitrary { + assert_eq!( + model.contains_key(&k), + entropy_map.contains_key(&k), + ); + assert_eq!(entropy_map.get(&k), model.get(&k)); + } + } + } + } + )* + }; + } + + proptest_map_with_dict_model!( + // (1, 8, 100), + (2, 8, 100), + (4, 8, 100), + (7, 8, 100), + (8, 8, 100), + (15, 8, 100), + (16, 8, 100), + (23, 8, 100), + (24, 8, 100), + (31, 8, 100), + (32, 8, 100), + (33, 8, 100), + (48, 8, 100), + (53, 8, 100), + (61, 8, 100), + (63, 8, 100), + (64, 8, 100), + (32, 7, 100), + (32, 5, 100), + (32, 4, 100), + (32, 3, 100), + (32, 1, 100), + (32, 0, 100), + (32, 8, 200), + (32, 6, 200), + ); +} diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 40117f6..8983750 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -5,7 +5,7 @@ //! as it reduces the overall memory footprint by packing unique values into a dictionary. The MPHF //! provides direct access to the indices of keys, which correspond to their respective values in //! the values dictionary. Keys are stored to ensure that `get` operation will return `None` if key -//! wasn't present in original set. +//! wasn't present in the original set. use std::{ borrow::Borrow, @@ -39,7 +39,7 @@ where impl MapWithDict where - K: Eq + Hash + Clone, + K: Hash, V: Eq + Clone + Hash, ST: PrimInt + Unsigned, H: Hasher + Default, @@ -55,7 +55,7 @@ where let mut offsets_cache = HashMap::new(); for (k, v) in iter { - keys.push(k.clone()); + keys.push(k); if let Some(&offset) = offsets_cache.get(&v) { // re-use dictionary offset if found in cache @@ -65,7 +65,7 @@ where let offset = values_dict.len(); offsets_cache.insert(v.clone(), offset); values_index.push(offset); - values_dict.push(v.clone()); + values_dict.push(v); } } diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index c549e61..f74b23d 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -53,7 +53,7 @@ pub enum Error { impl MapWithDictBitpacked where - K: Hash + PartialEq + Clone, + K: Hash, ST: PrimInt + Unsigned, H: Hasher + Default, { @@ -71,7 +71,7 @@ where let v_len = iter.peek().map_or(0, |(_, v)| v.len()); for (k, v) in iter { - keys.push(k.clone()); + keys.push(k); if v.len() != v_len { return Err(Error::NotEqualValuesLengths); @@ -81,13 +81,14 @@ where // re-use dictionary offset if found in cache values_index.push(offset); } else { - // store current dictionary length as an offset in both index and cache let offset = values_dict.len(); - offsets_cache.insert(v.clone(), offset); - values_index.push(offset); // append packed values to the dictionary pack_values(&v, &mut values_dict); + + // store dictionary length as an offset in both index and cache + offsets_cache.insert(v, offset); + values_index.push(offset); } } diff --git a/src/rank.rs b/src/rank.rs index 2502f4f..1b581f8 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -139,7 +139,7 @@ impl RankedBits { l12_ranks.push(l12_rank.into()); } - RankedBits { bits, l12_ranks: l12_ranks.into_boxed_slice() } + Self { bits, l12_ranks: l12_ranks.into_boxed_slice() } } /// Returns the total number of bytes occupied by `RankedBits` diff --git a/src/set.rs b/src/set.rs index bb43ed8..b6cc6fb 100644 --- a/src/set.rs +++ b/src/set.rs @@ -70,7 +70,7 @@ where } } - Ok(Set { mphf, keys: keys.into_boxed_slice() }) + Ok(Self { mphf, keys: keys.into_boxed_slice() }) } /// Returns `true` if the set contains the value. From 19b620c100d56fca8c070dea7b388ce6940a5063 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Wed, 22 Oct 2025 01:06:28 +0000 Subject: [PATCH 06/17] Bench original --- benches/map.rs | 15 +++++++++++++-- benches/map_with_dict.rs | 15 +++++++++++++-- benches/map_with_dict_bitpacked.rs | 2 +- benches/set.rs | 10 +++++----- src/map.rs | 13 ++++++++----- src/map_tuple.rs | 7 ++++--- src/map_with_dict.rs | 23 +++++++++++++---------- src/map_with_dict_bitpacked.rs | 13 ++++++------- src/mphf.rs | 3 +++ src/rank.rs | 5 +++-- 10 files changed, 69 insertions(+), 37 deletions(-) diff --git a/benches/map.rs b/benches/map.rs index 7dc7ad4..c945dec 100644 --- a/benches/map.rs +++ b/benches/map.rs @@ -22,6 +22,9 @@ pub fn benchmark(c: &mut Criterion) { .collect(); println!("map generation took: {:?}", t0.elapsed()); + // created with another hasher so the memory order is different to check random access + let hash_map: HashMap = HashMap::from_iter(original_map.clone()); + let t0 = Instant::now(); let map = Map::try_from(original_map.clone()).expect("failed to build map"); println!("map construction took: {:?}", t0.elapsed()); @@ -29,7 +32,15 @@ pub fn benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("Map"); group.throughput(Throughput::Elements(query_n as u64)); - group.bench_function("get", |b| { + group.bench_function("HashMap get", |b| { + b.iter(|| { + for key in original_map.keys().take(query_n) { + black_box(hash_map.get(key).unwrap()); + } + }); + }); + + group.bench_function("entropy get", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { black_box(map.get(key).unwrap()); @@ -43,7 +54,7 @@ pub fn benchmark(c: &mut Criterion) { let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - group.bench_function("get-rkyv", |b| { + group.bench_function("entropy archived get", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { black_box(rkyv_map.get(key).unwrap()); diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index b7ec4c7..fcc5c2f 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -36,6 +36,9 @@ pub fn benchmark(c: &mut Criterion) { .collect(); println!("map generation took: {:?}", t0.elapsed()); + // created with another hasher so the memory order is different to check random access + let hash_map: HashMap = HashMap::from_iter(original_map.clone()); + let t0 = Instant::now(); let map = MapWithDict::try_from(original_map.clone()).expect("failed to build map"); println!("map_with_dict construction took: {:?}", t0.elapsed()); @@ -43,7 +46,15 @@ pub fn benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("MapWithDict"); group.throughput(Throughput::Elements(query_n as u64)); - group.bench_function("get", |b| { + group.bench_function("HashMap get", |b| { + b.iter(|| { + for key in original_map.keys().take(query_n) { + black_box(hash_map.get(key).unwrap()); + } + }); + }); + + group.bench_function("entropy get", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { black_box(map.get(key).unwrap()); @@ -57,7 +68,7 @@ pub fn benchmark(c: &mut Criterion) { let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - group.bench_function("get-rkyv", |b| { + group.bench_function("entropy archived get", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { black_box(rkyv_map.get(key).unwrap()); diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index e698c94..9ff12c5 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -57,7 +57,7 @@ pub fn benchmark(c: &mut Criterion) { let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - group.bench_function("get-rkyv", |b| { + group.bench_function("archived get_values", |b| { b.iter(|| { for key in original_map.keys().take(query_n) { black_box(rkyv_map.get_values(key, &mut values_buf)); diff --git a/benches/set.rs b/benches/set.rs index 5e2a949..68ea3f1 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -30,7 +30,7 @@ pub fn benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("Set"); group.throughput(Throughput::Elements(query_n as u64)); - group.bench_function("entropy-contains-native", |b| { + group.bench_function("entropy contains", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { black_box(set.contains(key)); @@ -40,7 +40,7 @@ pub fn benchmark(c: &mut Criterion) { let set_fxhash: Set = Set::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA).expect("failed to build set"); - group.bench_function("entropy-contains-fxhash", |b| { + group.bench_function("entropy-fxhash contains", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { black_box(set_fxhash.contains(key)); @@ -50,7 +50,7 @@ pub fn benchmark(c: &mut Criterion) { let set_default_hasher: Set = Set::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA).expect("failed to build set"); - group.bench_function("entropy-contains-defaulthasher", |b| { + group.bench_function("entropy-DefaultHasher contains", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { black_box(set_default_hasher.contains(key)); @@ -59,7 +59,7 @@ pub fn benchmark(c: &mut Criterion) { }); let fxhash_set: HashSet = HashSet::from_iter(original_set.iter().cloned()); - group.bench_function("std-contains-fxhash", |b| { + group.bench_function("HashSet-fxhash contains", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { black_box(fxhash_set.contains(key)); @@ -69,7 +69,7 @@ pub fn benchmark(c: &mut Criterion) { let defaulthasher_set: HashSet> = HashSet::from_iter(original_set.iter().cloned()); - group.bench_function("std-contains-defaulthasher", |b| { + group.bench_function("HashSet-DefaultHasher contains", |b| { b.iter(|| { for key in original_set.iter().take(query_n) { black_box(defaulthasher_set.contains(key)); diff --git a/src/map.rs b/src/map.rs index 13e460a..e868976 100644 --- a/src/map.rs +++ b/src/map.rs @@ -9,7 +9,7 @@ use std::{ borrow::Borrow, collections::HashMap, - hash::{Hash, Hasher}, + hash::{BuildHasher, Hash, Hasher}, mem::size_of_val, }; @@ -217,15 +217,16 @@ where } /// Creates a `Map` from a `HashMap`. -impl TryFrom> for Map +impl TryFrom> for Map where K: Eq + Hash + Clone, V: Eq + Clone + Hash, + B: BuildHasher, { type Error = MphfError; #[inline] - fn try_from(value: HashMap) -> Result { + fn try_from(value: HashMap) -> Result { Self::from_iter_with_params(value, DEFAULT_GAMMA) } } @@ -371,7 +372,8 @@ mod tests { /// Assert that we can call `.get()` with `K::borrow()`. #[test] fn test_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = Map::try_from(original_map).unwrap(); assert_eq!(map.get("a"), Some(&())); @@ -408,7 +410,8 @@ mod tests { #[cfg(feature = "rkyv_derive")] #[test] fn test_rkyv_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = Map::try_from(original_map).unwrap(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); diff --git a/src/map_tuple.rs b/src/map_tuple.rs index d123d52..83ecabc 100644 --- a/src/map_tuple.rs +++ b/src/map_tuple.rs @@ -9,7 +9,7 @@ use std::{ borrow::Borrow, collections::HashMap, - hash::{Hash, Hasher}, + hash::{BuildHasher, Hash, Hasher}, mem::size_of_val, }; @@ -208,15 +208,16 @@ where } /// Creates a `Map` from a `HashMap`. -impl TryFrom> for MapTuple +impl TryFrom> for MapTuple where K: Eq + Hash + Clone, V: Eq + Clone + Hash, + B: BuildHasher, { type Error = MphfError; #[inline] - fn try_from(value: HashMap) -> Result { + fn try_from(value: HashMap) -> Result { Self::from_iter_with_params(value, DEFAULT_GAMMA) } } diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 8983750..4806d5b 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -10,7 +10,7 @@ use std::{ borrow::Borrow, collections::HashMap, - hash::{Hash, Hasher}, + hash::{BuildHasher, Hash, Hasher}, mem::size_of_val, }; @@ -31,7 +31,9 @@ where mphf: Mphf, /// Map keys keys: Box<[K]>, - /// Points to the value index in the dictionary + /// Points to the value index in the dictionary. + /// If rkyv pointer width feature is not enabled, it will serialize usize as 32-bit integers by default. + /// So it limits the max amount of values if you use archived MapWithDict. values_index: Box<[usize]>, /// Map unique values values_dict: Box<[V]>, @@ -110,10 +112,9 @@ where { let idx = self.mphf.get(key)?; - // SAFETY: `idx` is always within bounds (ensured during construction) + // SAFETY: `idx` and `value_idx` are always within bounds (ensured during construction) unsafe { if self.keys.get_unchecked(idx) == key { - // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) let value_idx = *self.values_index.get_unchecked(idx); Some(self.values_dict.get_unchecked(value_idx)) } else { @@ -254,15 +255,16 @@ where } /// Creates a `MapWithDict` from a `HashMap`. -impl TryFrom> for MapWithDict +impl TryFrom> for MapWithDict where K: Eq + Hash + Clone, V: Eq + Clone + Hash, + B: BuildHasher, { type Error = MphfError; #[inline] - fn try_from(value: HashMap) -> Result { + fn try_from(value: HashMap) -> Result { Self::from_iter_with_params(value, DEFAULT_GAMMA) } } @@ -328,10 +330,9 @@ where { let idx = self.mphf.get(key)?; - // SAFETY: `idx` is always within bounds (ensured during construction) + // SAFETY: `idx` and `value_idx` are always within bounds (ensured during construction) unsafe { if self.keys.get_unchecked(idx) == key { - // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) let value_idx = self.values_index.get_unchecked(idx).to_native() as usize; Some(self.values_dict.get_unchecked(value_idx)) } else { @@ -417,7 +418,8 @@ mod tests { /// Assert that we can call `.get()` with `K::borrow()`. #[test] fn test_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = MapWithDict::try_from(original_map).unwrap(); assert_eq!(map.get("a"), Some(&())); @@ -454,7 +456,8 @@ mod tests { #[cfg(feature = "rkyv_derive")] #[test] fn test_rkyv_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = MapWithDict::try_from(original_map).unwrap(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index f74b23d..9d17473 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -14,7 +14,7 @@ use std::{ borrow::Borrow, collections::HashMap, - hash::{Hash, Hasher}, + hash::{BuildHasher, Hash, Hasher}, mem::size_of_val, }; @@ -141,13 +141,12 @@ where None => return false, }; - // SAFETY: `idx` is always within bounds (ensured during construction) + // SAFETY: `idx` and `value_idx` are always within bounds (ensured during construction) unsafe { if self.keys.get_unchecked(idx) != key { return false; } - // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) let value_idx = *self.values_index.get_unchecked(idx); let dict = self.values_dict.get_unchecked(value_idx..); unpack_values(dict, values); @@ -289,14 +288,15 @@ where } /// Creates a `MapWithDictBitpacked` from a `HashMap`. -impl TryFrom>> for MapWithDictBitpacked +impl TryFrom, B>> for MapWithDictBitpacked where K: PartialEq + Hash + Clone, + B: BuildHasher, { type Error = Error; #[inline] - fn try_from(value: HashMap>) -> Result { + fn try_from(value: HashMap, B>) -> Result { MapWithDictBitpacked::from_iter_with_params(value, DEFAULT_GAMMA) } } @@ -382,13 +382,12 @@ where None => return false, }; - // SAFETY: `idx` is always within bounds (ensured during construction) + // SAFETY: `idx` and `value_idx` are always within bounds (ensured during construction) unsafe { if self.keys.get_unchecked(idx) != key { return false; } - // SAFETY: `idx` and `value_idx` are always within bounds (ensure during construction) let value_idx = self.values_index.get_unchecked(idx).to_native() as usize; let dict = self.values_dict.get_unchecked(value_idx..); unpack_values(dict, values); diff --git a/src/mphf.rs b/src/mphf.rs index cd04b38..5b06e8a 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -3,6 +3,9 @@ //! This module implements a Minimal Perfect Hash Function (MPHF) based on fingerprinting techniques, //! as detailed in [Fingerprinting-based minimal perfect hashing revisited](https://doi.org/10.1145/3596453). //! +//! If you query with keys that were not used at the time of construction, collisions can happen. +//! Other structures are free of collisions, because they store `keys` and compare on each get. +//! //! This implementation is inspired by existing Rust crate [ph](https://github.com/beling/bsuccinct-rs/tree/main/ph), //! but prioritizes code simplicity and portability, with a special focus on optimizing the rank //! storage mechanism and reducing the construction time and querying latency of MPHF. diff --git a/src/rank.rs b/src/rank.rs index 1b581f8..fbea594 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -161,8 +161,9 @@ impl RankedBitsAccess for RankedBits { impl RankedBitsAccess for ArchivedRankedBits { #[inline] fn rank(&self, idx: usize) -> Option { - // transmute? - unsafe { Self::rank_impl(std::mem::transmute(self.bits.get()), &self.l12_ranks, idx) } + // todo: transmutes `u64_le` to `u64`. May result in incorrect bits on `be` targets. + // But if `be` user enables rkyv big_endian feature, it will not. So enable the feature for them. + unsafe { Self::rank_impl(core::mem::transmute(self.bits.get()), &self.l12_ranks, idx) } } } From 373b4127d60864d86148ba7786100a33ee1b36ba Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Wed, 22 Oct 2025 18:22:31 +0000 Subject: [PATCH 07/17] Fix ST types --- benches/map.rs | 27 ++++++++++---- benches/map_with_dict.rs | 28 ++++++++------- benches/map_with_dict_bitpacked.rs | 24 ++++++------- benches/mphf.rs | 46 ++++++++++++------------ benches/rank.rs | 13 +++---- benches/set.rs | 27 ++++++++++++-- src/lib.rs | 57 ++++++++++++++++++++++++++++++ src/map.rs | 20 ++++++----- src/map_tuple.rs | 16 ++++++--- src/map_with_dict.rs | 10 ++++-- src/map_with_dict_bitpacked.rs | 10 ++++-- src/mphf.rs | 17 +++++---- src/set.rs | 10 ++++-- 13 files changed, 211 insertions(+), 94 deletions(-) diff --git a/benches/map.rs b/benches/map.rs index c945dec..fbbda2f 100644 --- a/benches/map.rs +++ b/benches/map.rs @@ -6,13 +6,29 @@ use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; +/// Benchmark results for N = 1M: +/// +/// Map construction took: 14.06023608s +/// +/// Map/HashMap get +/// time: [18.350 ms 18.518 ms 18.707 ms] +/// thrpt: [53.455 Melem/s 54.001 Melem/s 54.496 Melem/s] +/// +/// Map/entropy get +/// time: [29.678 ms 29.985 ms 30.320 ms] +/// thrpt: [32.982 Melem/s 33.350 Melem/s 33.695 Melem/s] +/// +/// Map rkyv serialization took: 1.474797ms +/// +/// Map/entropy archived get +/// time: [25.789 ms 26.182 ms 26.617 ms] +/// thrpt: [37.570 Melem/s 38.194 Melem/s 38.776 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(123); - let t0 = Instant::now(); let original_map: HashMap = (0..n) .map(|_| { let key = rng.gen::(); @@ -20,14 +36,13 @@ pub fn benchmark(c: &mut Criterion) { (key, value) }) .collect(); - println!("map generation took: {:?}", t0.elapsed()); // created with another hasher so the memory order is different to check random access let hash_map: HashMap = HashMap::from_iter(original_map.clone()); let t0 = Instant::now(); - let map = Map::try_from(original_map.clone()).expect("failed to build map"); - println!("map construction took: {:?}", t0.elapsed()); + let map: Map = Map::from_iter_with_params(original_map.clone(), 2.4).unwrap(); + println!("Map construction took: {:?}", t0.elapsed()); let mut group = c.benchmark_group("Map"); group.throughput(Throughput::Elements(query_n as u64)); @@ -50,9 +65,9 @@ pub fn benchmark(c: &mut Criterion) { let t0 = Instant::now(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - println!("map rkyv serialization took: {:?}", t0.elapsed()); + println!("Map rkyv serialization took: {:?}", t0.elapsed()); - let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); group.bench_function("entropy archived get", |b| { b.iter(|| { diff --git a/benches/map_with_dict.rs b/benches/map_with_dict.rs index fcc5c2f..568c70d 100644 --- a/benches/map_with_dict.rs +++ b/benches/map_with_dict.rs @@ -8,24 +8,27 @@ use rand_chacha::ChaCha8Rng; /// Benchmark results for N = 1M: /// -/// map generation took: 55.309498ms -/// map_with_dict construction took: 1.411034205s -/// map_with_dict rkyv serialization took: 8.233451ms +/// MapWithDict construction took: 1.103815976s /// -/// # map_with_dict/get -/// time: [75.423 ms 75.814 ms 76.304 ms] -/// thrpt: [13.106 Melem/s 13.190 Melem/s 13.259 Melem/s] +/// MapWithDict/HashMap get +/// time: [18.856 ms 18.921 ms 18.994 ms] +/// thrpt: [52.650 Melem/s 52.850 Melem/s 53.033 Melem/s] /// -/// # map_with_dict/get-rkyv -/// time: [74.267 ms 74.681 ms 75.225 ms] -/// thrpt: [13.293 Melem/s 13.390 Melem/s 13.465 Melem/s] +/// MapWithDict/entropy get +/// time: [45.107 ms 45.406 ms 45.728 ms] +/// thrpt: [21.868 Melem/s 22.023 Melem/s 22.170 Melem/s] +/// +/// MapWithDict rkyv serialization took: 2.496905ms +/// +/// MapWithDict/entropy archived get +/// time: [40.738 ms 41.139 ms 41.575 ms] +/// thrpt: [24.053 Melem/s 24.308 Melem/s 24.547 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(123); - let t0 = Instant::now(); let original_map: HashMap = (0..n) .map(|_| { let key = rng.gen::(); @@ -34,14 +37,13 @@ pub fn benchmark(c: &mut Criterion) { (key, value) }) .collect(); - println!("map generation took: {:?}", t0.elapsed()); // created with another hasher so the memory order is different to check random access let hash_map: HashMap = HashMap::from_iter(original_map.clone()); let t0 = Instant::now(); let map = MapWithDict::try_from(original_map.clone()).expect("failed to build map"); - println!("map_with_dict construction took: {:?}", t0.elapsed()); + println!("MapWithDict construction took: {:?}", t0.elapsed()); let mut group = c.benchmark_group("MapWithDict"); group.throughput(Throughput::Elements(query_n as u64)); @@ -64,7 +66,7 @@ pub fn benchmark(c: &mut Criterion) { let t0 = Instant::now(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - println!("map_with_dict rkyv serialization took: {:?}", t0.elapsed()); + println!("MapWithDict rkyv serialization took: {:?}", t0.elapsed()); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); diff --git a/benches/map_with_dict_bitpacked.rs b/benches/map_with_dict_bitpacked.rs index 9ff12c5..fbaa069 100644 --- a/benches/map_with_dict_bitpacked.rs +++ b/benches/map_with_dict_bitpacked.rs @@ -8,24 +8,23 @@ use rand_chacha::ChaCha8Rng; /// Benchmark results for N = 1M: /// -/// map generation took: 199.621887ms -/// map_with_dict_bitpacked construction took: 2.36439657s -/// map_with_dict_bitpacked rkyv serialization took: 20.455775ms +/// MapWithDictBitpacked construction took: 1.530962829s /// -/// # map_with_dict_bitpacked/get_values -/// time: [169.36 ms 170.24 ms 171.06 ms] -/// thrpt: [5.8459 Melem/s 5.8740 Melem/s 5.9044 Melem/s] +/// MapWithDictBitpacked/get_values +/// time: [95.556 ms 96.288 ms 97.068 ms] +/// thrpt: [10.302 Melem/s 10.385 Melem/s 10.465 Melem/s] /// -/// # map_with_dict_bitpacked/get_values-rkyv -/// time: [167.92 ms 168.82 ms 169.65 ms] -/// thrpt: [5.8946 Melem/s 5.9233 Melem/s 5.9553 Melem/s] +/// MapWithDictBitpacked rkyv serialization took: 4.85859ms +/// +/// MapWithDictBitpacked/archived get_values +/// time: [79.066 ms 79.977 ms 81.002 ms] +/// thrpt: [12.345 Melem/s 12.504 Melem/s 12.648 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(123); - let t0 = Instant::now(); let mut values_buf = vec![0; 10]; let original_map: HashMap> = (0..n) .map(|_| { @@ -34,11 +33,10 @@ pub fn benchmark(c: &mut Criterion) { (key, value) }) .collect(); - println!("map generation took: {:?}", t0.elapsed()); let t0 = Instant::now(); let map = MapWithDictBitpacked::try_from(original_map.clone()).expect("failed to build map"); - println!("map_with_dict_bitpacked construction took: {:?}", t0.elapsed()); + println!("MapWithDictBitpacked construction took: {:?}", t0.elapsed()); let mut group = c.benchmark_group("MapWithDictBitpacked"); group.throughput(Throughput::Elements(query_n as u64)); @@ -53,7 +51,7 @@ pub fn benchmark(c: &mut Criterion) { let t0 = Instant::now(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - println!("map_with_dict_bitpacked rkyv serialization took: {:?}", t0.elapsed()); + println!("MapWithDictBitpacked rkyv serialization took: {:?}", t0.elapsed()); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); diff --git a/benches/mphf.rs b/benches/mphf.rs index b30d000..bf47710 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -7,27 +7,29 @@ use rand::random; /// # Benchmark results for N = 1M: /// -/// items generation took: 7.164763ms +/// Mphf (1.0) construction took: 1.291480619s, bits per key: 2.10 /// -/// # mphf/mphf-get/gamma-1.0 -/// mphf (1.0) construction took: 1.510804159s, bits per key = 2.10 -/// time: [14.326 ms 14.372 ms 14.427 ms] -/// thrpt: [69.315 Melem/s 69.582 Melem/s 69.803 Melem/s] +/// Mphf/get gamma 1.0 +/// time: [26.144 ms 26.267 ms 26.412 ms] +/// thrpt: [37.862 Melem/s 38.071 Melem/s 38.250 Melem/s] /// -/// # mphf/rkyv-mphf-get/gamma-1.0 -/// mphf (1.0) rkyv serialization took: 128.191µs -/// time: [14.389 ms 14.413 ms 14.446 ms] -/// thrpt: [69.225 Melem/s 69.382 Melem/s 69.499 Melem/s] +/// Mphf (1.0) rkyv serialization took: 21.024µs /// -/// # mphf/mphf-get/gamma-2.0 -/// mphf (2.0) construction took: 1.188994719s, bits per key = 2.72 -/// time: [4.5842 ms 4.5959 ms 4.6084 ms] -/// thrpt: [217.00 Melem/s 217.59 Melem/s 218.14 Melem/s] +/// Mphf/archived get gamma 1.0 +/// time: [26.309 ms 26.397 ms 26.520 ms] +/// thrpt: [37.707 Melem/s 37.883 Melem/s 38.010 Melem/s] /// -/// # mphf/rkyv-mphf-get/gamma-2.0 -/// mphf (2.0) rkyv serialization took: 165.901µs -/// time: [4.6885 ms 4.7272 ms 4.7728 ms] -/// thrpt: [209.52 Melem/s 211.54 Melem/s 213.29 Melem/s] +/// Mphf (2.0) construction took: 982.578471ms, bits per key: 2.72 +/// +/// Mphf/get gamma 2.0 +/// time: [19.458 ms 19.683 ms 19.928 ms] +/// thrpt: [50.179 Melem/s 50.805 Melem/s 51.392 Melem/s] +/// +/// Mphf (2.0) rkyv serialization took: 24.643µs +/// +/// Mphf/archived get gamma 2.0 +/// time: [19.901 ms 20.239 ms 20.663 ms] +/// thrpt: [48.396 Melem/s 49.411 Melem/s 50.250 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); @@ -35,22 +37,20 @@ pub fn benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("Mphf"); group.throughput(Throughput::Elements(query_n as u64)); - let t0 = Instant::now(); let items: Vec = (0..n).map(|_| random()).collect(); - println!("items generation took: {:?}", t0.elapsed()); for &gamma in &[1.0_f32, 2.0_f32] { let t0 = Instant::now(); let mphf = Mphf::<32, 8>::from_slice(&items, gamma).expect("failed to build mphf"); let bits = (mphf.size() as f32) * 8.0 / (n as f32); println!( - "mphf ({:.1}) construction took: {:?}, bits per key: {:.2}", + "Mphf ({:.1}) construction took: {:?}, bits per key: {:.2}", gamma, t0.elapsed(), bits ); - group.bench_function(format!("mphf-get/gamma-{:.1}", gamma), |b| { + group.bench_function(format!("get gamma {:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { black_box(mphf.get(item).unwrap()); @@ -60,11 +60,11 @@ pub fn benchmark(c: &mut Criterion) { let t0 = Instant::now(); let rkyv_bytes = rkyv::to_bytes::(&mphf).unwrap(); - println!("mphf ({:.1}) rkyv serialization took: {:?}", gamma, t0.elapsed()); + println!("Mphf ({:.1}) rkyv serialization took: {:?}", gamma, t0.elapsed()); let rkyv_mphf = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - group.bench_function(format!("rkyv-mphf-get/gamma-{:.1}", gamma), |b| { + group.bench_function(format!("archived get gamma {:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { black_box(rkyv_mphf.get(item).unwrap()); diff --git a/benches/rank.rs b/benches/rank.rs index 4243b72..a0f2fa3 100644 --- a/benches/rank.rs +++ b/benches/rank.rs @@ -7,28 +7,25 @@ use rand::{prelude::SliceRandom, random}; /// Benchmark results for N = 1M: /// -/// indices generation took: 15.462759ms -/// ranked bits construction took: 21.978µs, overhead: 3.16% +/// RankedBits construction took: 9.904µs, overhead: 3.16% /// -/// # ranked_bits/rank -/// time: [616.89 µs 629.04 µs 643.46 µs] -/// thrpt: [1.5541 Gelem/s 1.5897 Gelem/s 1.6210 Gelem/s] +/// RankedBits/rank +/// time: [8.3597 ms 8.4021 ms 8.4608 ms] +/// thrpt: [118.19 Melem/s 119.02 Melem/s 119.62 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); let n_u64 = n / 64; - let t0 = Instant::now(); let data: Vec = (0..n_u64).map(|_| random()).collect(); let mut indices: Vec = (0..n).collect(); indices.shuffle(&mut rand::thread_rng()); - println!("indices generation took: {:?}", t0.elapsed()); let t0 = Instant::now(); let ranked_bits = RankedBits::new(data.into_boxed_slice()); let overhead = ((ranked_bits.size() as f32) * 8.0 / (n as f32) - 1.0) * 100.0; println!( - "ranked bits construction took: {:?}, overhead: {:.2}%", + "RankedBits construction took: {:?}, overhead: {:.2}%", t0.elapsed(), overhead ); diff --git a/benches/set.rs b/benches/set.rs index 68ea3f1..ac5ed30 100644 --- a/benches/set.rs +++ b/benches/set.rs @@ -12,20 +12,41 @@ use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; +/// Benchmark results for N = 1M: +/// +/// Set construction took: 1.022528566s +/// +/// Set/entropy contains +/// time: [26.171 ms 26.454 ms 26.795 ms] +/// thrpt: [37.320 Melem/s 37.802 Melem/s 38.210 Melem/s] +/// +/// Set/entropy-fxhash contains +/// time: [24.902 ms 25.265 ms 25.667 ms] +/// thrpt: [38.960 Melem/s 39.581 Melem/s 40.158 Melem/s] +/// +/// Set/entropy-DefaultHasher contains +/// time: [32.400 ms 32.839 ms 33.343 ms] +/// thrpt: [29.991 Melem/s 30.452 Melem/s 30.864 Melem/s] +/// +/// Set/HashSet-fxhash contains +/// time: [14.398 ms 14.704 ms 15.039 ms] +/// thrpt: [66.494 Melem/s 68.007 Melem/s 69.454 Melem/s] +/// +/// Set/HashSet-DefaultHasher contains +/// time: [34.512 ms 34.877 ms 35.292 ms] +/// thrpt: [28.335 Melem/s 28.673 Melem/s 28.975 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(123); - let t0 = Instant::now(); let original_set: HashSet = (0..n).map(|_| rng.gen::()).collect(); - println!("set generation took: {:?}", t0.elapsed()); let t0 = Instant::now(); let set = Set::::from_iter_with_params(original_set.iter().cloned(), DEFAULT_GAMMA).expect("failed to build set"); - println!("set construction took: {:?}", t0.elapsed()); + println!("Set construction took: {:?}", t0.elapsed()); let mut group = c.benchmark_group("Set"); group.throughput(Throughput::Elements(query_n as u64)); diff --git a/src/lib.rs b/src/lib.rs index babd652..7f2c222 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,3 +11,60 @@ pub use map_with_dict_bitpacked::*; pub use mphf::*; pub use rank::*; pub use set::*; + +pub trait GroupSeed { + fn into_u32(self) -> u32; +} + +impl GroupSeed for u8 { + #[inline(always)] + fn into_u32(self) -> u32 { + self as u32 + } +} + +impl GroupSeed for u16 { + #[inline(always)] + fn into_u32(self) -> u32 { + self as u32 + } +} + +#[cfg(feature = "rkyv_derive")] +impl GroupSeed for rkyv::rend::u16_le { + #[inline(always)] + fn into_u32(self) -> u32 { + self.to_native() as u32 + } +} + +#[cfg(feature = "rkyv_derive")] +impl GroupSeed for rkyv::rend::u16_be { + #[inline(always)] + fn into_u32(self) -> u32 { + self.to_native() as u32 + } +} + +impl GroupSeed for u32 { + #[inline(always)] + fn into_u32(self) -> u32 { + self + } +} + +#[cfg(feature = "rkyv_derive")] +impl GroupSeed for rkyv::rend::u32_le { + #[inline(always)] + fn into_u32(self) -> u32 { + self.to_native() + } +} + +#[cfg(feature = "rkyv_derive")] +impl GroupSeed for rkyv::rend::u32_be { + #[inline(always)] + fn into_u32(self) -> u32 { + self.to_native() + } +} diff --git a/src/map.rs b/src/map.rs index e868976..8cfc8ee 100644 --- a/src/map.rs +++ b/src/map.rs @@ -16,7 +16,10 @@ use std::{ use num::{PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; +use crate::{ + mphf::{Mphf, MphfError, DEFAULT_GAMMA}, + GroupSeed, +}; /// An efficient, immutable hash map. #[derive(Default)] @@ -37,7 +40,7 @@ where impl Map where K: Hash, - ST: PrimInt + Unsigned, + ST: PrimInt + Unsigned + GroupSeed, H: Hasher + Default, { /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. @@ -238,7 +241,8 @@ where K: PartialEq + Hash + rkyv::Archive, K::Archived: PartialEq, V: rkyv::Archive, - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { /// Checks if the map contains the specified key. @@ -336,7 +340,7 @@ mod tests { let original_map = gen_map(1000); // Create the map from the iterator - let map = Map::try_from(original_map.clone()).unwrap(); + let map: Map = Map::from_iter_with_params(original_map.clone(), DEFAULT_GAMMA).unwrap(); // Test len assert_eq!(map.len(), original_map.len()); @@ -366,7 +370,7 @@ mod tests { } // Test size - assert_eq!(map.size(), 12570); + assert_eq!(map.size(), 12546); } /// Assert that we can call `.get()` with `K::borrow()`. @@ -389,12 +393,12 @@ mod tests { fn test_rkyv() { // create regular `HashMap`, then `Map`, then serialize to `rkyv` bytes. let original_map = gen_map(1000); - let map = Map::try_from(original_map.clone()).unwrap(); + let map: Map = Map::from_iter_with_params(original_map.clone(), DEFAULT_GAMMA).unwrap(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - assert_eq!(rkyv_bytes.len(), 12432); + assert_eq!(rkyv_bytes.len(), 12408); - let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); + let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); // Test get on `Archived` version for (k, v) in original_map.iter() { diff --git a/src/map_tuple.rs b/src/map_tuple.rs index 83ecabc..effecc0 100644 --- a/src/map_tuple.rs +++ b/src/map_tuple.rs @@ -16,7 +16,10 @@ use std::{ use num::{PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; +use crate::{ + mphf::{Mphf, MphfError, DEFAULT_GAMMA}, + GroupSeed, +}; /// An efficient, immutable hash map. #[derive(Default)] @@ -34,7 +37,7 @@ where impl MapTuple where K: Hash, - ST: PrimInt + Unsigned, + ST: PrimInt + Unsigned + GroupSeed, H: Hasher + Default, { /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. @@ -229,7 +232,8 @@ where K: PartialEq + Hash + rkyv::Archive, K::Archived: PartialEq, V: rkyv::Archive, - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { /// Checks if the map contains the specified key. @@ -366,7 +370,8 @@ mod tests { /// Assert that we can call `.get()` with `K::borrow()`. #[test] fn test_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = MapTuple::try_from(original_map).unwrap(); assert_eq!(map.get("a"), Some(&())); @@ -403,7 +408,8 @@ mod tests { #[cfg(feature = "rkyv_derive")] #[test] fn test_rkyv_get_borrow() { - let original_map = HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); + let original_map: HashMap = + HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); let map = MapTuple::try_from(original_map).unwrap(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 4806d5b..df29f22 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -17,7 +17,10 @@ use std::{ use num::{PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; +use crate::{ + mphf::{Mphf, MphfError, DEFAULT_GAMMA}, + GroupSeed, +}; /// An efficient, immutable hash map with values dictionary-packed for optimized space usage. #[derive(Default)] @@ -43,7 +46,7 @@ impl MapWithDict where K: Hash, V: Eq + Clone + Hash, - ST: PrimInt + Unsigned, + ST: PrimInt + Unsigned + GroupSeed, H: Hasher + Default, { /// Constructs a `MapWithDict` from an iterator of key-value pairs and MPHF function params. @@ -276,7 +279,8 @@ where K: PartialEq + Hash + rkyv::Archive, K::Archived: PartialEq, V: rkyv::Archive, - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { /// Checks if the map contains the specified key. diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index 9d17473..8be629f 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -22,7 +22,10 @@ use bitpacking::{BitPacker, BitPacker1x}; use num::{PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::{Mphf, DEFAULT_GAMMA}; +use crate::{ + mphf::{Mphf, DEFAULT_GAMMA}, + GroupSeed, +}; /// An efficient, immutable hash map with bit-packed `Vec` values for optimized space usage. #[derive(Default)] @@ -54,7 +57,7 @@ pub enum Error { impl MapWithDictBitpacked where K: Hash, - ST: PrimInt + Unsigned, + ST: PrimInt + Unsigned + GroupSeed, H: Hasher + Default, { /// Constructs a `MapWithDictBitpacked` from an iterator of key-value pairs and MPHF function params. @@ -356,7 +359,8 @@ impl ArchivedMapWithDictBitpacked, - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { /// Updates `values` to the array of values corresponding to the key. Returns `false` if the diff --git a/src/mphf.rs b/src/mphf.rs index 5b06e8a..f34c6ed 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -22,6 +22,7 @@ use wyhash::WyHash; use crate::{ mphf::MphfError::*, rank::{RankedBits, RankedBitsAccess}, + GroupSeed, }; /// A Minimal Perfect Hash Function (MPHF). @@ -246,7 +247,10 @@ impl(&self, key: &K) -> Option { + pub fn get(&self, key: &K) -> Option + where + ST: GroupSeed + Copy, + { Self::get_impl( key, self.level_groups.iter().copied(), @@ -258,10 +262,10 @@ impl( + fn get_impl( key: &K, level_groups: impl Iterator, - group_seeds: &[ST], + group_seeds: &[GS], ranked_bits: &impl RankedBitsAccess, ) -> Option { let mut groups_before = 0; @@ -269,7 +273,7 @@ impl(key), level as u32); let group_idx = groups_before + fastmod32(level_hash as u32, groups); // SAFETY: `group_idx` is always within bounds (ensured during calculation) - let group_seed = unsafe { group_seeds.get_unchecked(group_idx).to_u32().unwrap() }; + let group_seed = unsafe { group_seeds.get_unchecked(group_idx).into_u32() }; let bit_idx = bit_index_for_seed::(level_hash, group_seed, group_idx); if let Some(rank) = ranked_bits.rank(bit_idx) { return Some(rank); @@ -329,7 +333,8 @@ fn fastmod32(x: u32, n: u32) -> usize { #[cfg(feature = "rkyv_derive")] impl ArchivedMphf where - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { #[inline] @@ -337,7 +342,7 @@ where Mphf::::get_impl( key, self.level_groups.iter().map(|v| v.to_native()), - &self.group_seeds, + self.group_seeds.get(), &self.ranked_bits, ) } diff --git a/src/set.rs b/src/set.rs index b6cc6fb..26bc397 100644 --- a/src/set.rs +++ b/src/set.rs @@ -20,7 +20,10 @@ use std::{ use num::{PrimInt, Unsigned}; use wyhash::WyHash; -use crate::mphf::{Mphf, MphfError, DEFAULT_GAMMA}; +use crate::{ + mphf::{Mphf, MphfError, DEFAULT_GAMMA}, + GroupSeed, +}; /// An efficient, immutable set. #[derive(Default)] @@ -39,7 +42,7 @@ where impl Set where K: Eq + Hash, - ST: PrimInt + Unsigned, + ST: PrimInt + Unsigned + GroupSeed, H: Hasher + Default, { /// Constructs a `Set` from an iterator of keys and MPHF function parameters. @@ -176,7 +179,8 @@ impl ArchivedSet where K: Eq + Hash + rkyv::Archive, K::Archived: PartialEq, - ST: PrimInt + Unsigned + rkyv::Archive, + ST: PrimInt + Unsigned + rkyv::Archive, + ::Archived: GroupSeed + Copy, H: Hasher + Default, { /// Returns `true` if the set contains the value. From 29190e21149894d7c96ac72997bfb677399aa1e4 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Wed, 22 Oct 2025 20:40:57 +0000 Subject: [PATCH 08/17] Clippy --- Cargo.toml | 1 - benches/mphf.rs | 12 ++++++------ src/map.rs | 8 ++++---- src/map_tuple.rs | 8 ++++---- src/map_with_dict.rs | 8 ++++---- src/rank.rs | 7 +++++-- src/set.rs | 4 ++-- 7 files changed, 25 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 32cee79..20605f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,6 @@ paste = "1.0.14" proptest = "1.4.0" rand = "0.8.5" rand_chacha = "0.3.1" -rkyv = "0.8" rustc-hash = "2" test-case = "3.3.1" diff --git a/benches/mphf.rs b/benches/mphf.rs index bf47710..150136b 100644 --- a/benches/mphf.rs +++ b/benches/mphf.rs @@ -9,25 +9,25 @@ use rand::random; /// /// Mphf (1.0) construction took: 1.291480619s, bits per key: 2.10 /// -/// Mphf/get gamma 1.0 +/// Mphf/get gamma: 1.0 /// time: [26.144 ms 26.267 ms 26.412 ms] /// thrpt: [37.862 Melem/s 38.071 Melem/s 38.250 Melem/s] /// /// Mphf (1.0) rkyv serialization took: 21.024µs /// -/// Mphf/archived get gamma 1.0 +/// Mphf/archived get gamma: 1.0 /// time: [26.309 ms 26.397 ms 26.520 ms] /// thrpt: [37.707 Melem/s 37.883 Melem/s 38.010 Melem/s] /// /// Mphf (2.0) construction took: 982.578471ms, bits per key: 2.72 /// -/// Mphf/get gamma 2.0 +/// Mphf/get gamma: 2.0 /// time: [19.458 ms 19.683 ms 19.928 ms] /// thrpt: [50.179 Melem/s 50.805 Melem/s 51.392 Melem/s] /// /// Mphf (2.0) rkyv serialization took: 24.643µs /// -/// Mphf/archived get gamma 2.0 +/// Mphf/archived get gamma: 2.0 /// time: [19.901 ms 20.239 ms 20.663 ms] /// thrpt: [48.396 Melem/s 49.411 Melem/s 50.250 Melem/s] pub fn benchmark(c: &mut Criterion) { @@ -50,7 +50,7 @@ pub fn benchmark(c: &mut Criterion) { bits ); - group.bench_function(format!("get gamma {:.1}", gamma), |b| { + group.bench_function(format!("get gamma: {:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { black_box(mphf.get(item).unwrap()); @@ -64,7 +64,7 @@ pub fn benchmark(c: &mut Criterion) { let rkyv_mphf = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - group.bench_function(format!("archived get gamma {:.1}", gamma), |b| { + group.bench_function(format!("archived get gamma: {:.1}", gamma), |b| { b.iter(|| { for item in items.iter().take(query_n) { black_box(rkyv_mphf.get(item).unwrap()); diff --git a/src/map.rs b/src/map.rs index 8cfc8ee..6c9eaf4 100644 --- a/src/map.rs +++ b/src/map.rs @@ -259,11 +259,11 @@ where /// assert_eq!(archived_map.contains_key(&2), false); /// ``` #[inline] - pub fn contains_key(&self, key: &Q) -> bool + pub fn contains_key(&self, key: &Q) -> bool where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { if let Some(idx) = self.mphf.get(key) { // SAFETY: `idx` is always within bounds (ensured during construction) @@ -288,11 +288,11 @@ where /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); /// ``` #[inline] - pub fn get(&self, key: &Q) -> Option<&V::Archived> + pub fn get(&self, key: &Q) -> Option<&V::Archived> where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { let idx = self.mphf.get(key)?; diff --git a/src/map_tuple.rs b/src/map_tuple.rs index effecc0..615aaa9 100644 --- a/src/map_tuple.rs +++ b/src/map_tuple.rs @@ -250,11 +250,11 @@ where /// assert_eq!(archived_map.contains_key(&2), false); /// ``` #[inline] - pub fn contains_key(&self, key: &Q) -> bool + pub fn contains_key(&self, key: &Q) -> bool where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { if let Some(idx) = self.mphf.get(key) { // SAFETY: `idx` is always within bounds (ensured during construction) @@ -281,11 +281,11 @@ where /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); /// ``` #[inline] - pub fn get(&self, key: &Q) -> Option<&V::Archived> + pub fn get(&self, key: &Q) -> Option<&V::Archived> where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { let idx = self.mphf.get(key)?; diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index df29f22..748d4b9 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -297,11 +297,11 @@ where /// assert_eq!(archived_map.contains_key(&2), false); /// ``` #[inline] - pub fn contains_key(&self, key: &Q) -> bool + pub fn contains_key(&self, key: &Q) -> bool where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { if let Some(idx) = self.mphf.get(key) { // SAFETY: `idx` is always within bounds (ensured during construction) @@ -326,11 +326,11 @@ where /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); /// ``` #[inline] - pub fn get(&self, key: &Q) -> Option<&V::Archived> + pub fn get(&self, key: &Q) -> Option<&V::Archived> where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { let idx = self.mphf.get(key)?; diff --git a/src/rank.rs b/src/rank.rs index fbea594..ae3dd76 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -162,8 +162,11 @@ impl RankedBitsAccess for ArchivedRankedBits { #[inline] fn rank(&self, idx: usize) -> Option { // todo: transmutes `u64_le` to `u64`. May result in incorrect bits on `be` targets. - // But if `be` user enables rkyv big_endian feature, it will not. So enable the feature for them. - unsafe { Self::rank_impl(core::mem::transmute(self.bits.get()), &self.l12_ranks, idx) } + // But if `be` user enables rkyv big_endian feature, it will be fixed. So maybe enable the feature for them. + #[allow(clippy::missing_transmute_annotations)] + unsafe { + Self::rank_impl(core::mem::transmute(self.bits.get()), &self.l12_ranks, idx) + } } } diff --git a/src/set.rs b/src/set.rs index 26bc397..772e72a 100644 --- a/src/set.rs +++ b/src/set.rs @@ -196,11 +196,11 @@ where /// assert_eq!(archived_set.contains(&4), false); /// ``` #[inline] - pub fn contains(&self, key: &Q) -> bool + pub fn contains(&self, key: &Q) -> bool where K: Borrow, ::Archived: PartialEq, - Q: Hash + Eq, + Q: Hash + Eq + ?Sized, { // SAFETY: `idx` is always within bounds (ensured during construction) self.mphf From ab08b51f5962defce37a24b7ef3a883f2b8d8d5f Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Thu, 23 Oct 2025 11:28:44 +0000 Subject: [PATCH 09/17] Fix rank_impl bits native --- src/lib.rs | 43 +++++++++++++++++++++++++++------- src/map.rs | 6 ++--- src/map_with_dict.rs | 6 ++--- src/map_with_dict_bitpacked.rs | 6 ++--- src/mphf.rs | 9 +++---- src/rank.rs | 17 ++++++-------- src/set.rs | 6 ++--- 7 files changed, 59 insertions(+), 34 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7f2c222..9838f1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,18 +12,18 @@ pub use mphf::*; pub use rank::*; pub use set::*; -pub trait GroupSeed { +pub trait IntoGroupSeed: Copy { fn into_u32(self) -> u32; } -impl GroupSeed for u8 { +impl IntoGroupSeed for u8 { #[inline(always)] fn into_u32(self) -> u32 { self as u32 } } -impl GroupSeed for u16 { +impl IntoGroupSeed for u16 { #[inline(always)] fn into_u32(self) -> u32 { self as u32 @@ -31,7 +31,7 @@ impl GroupSeed for u16 { } #[cfg(feature = "rkyv_derive")] -impl GroupSeed for rkyv::rend::u16_le { +impl IntoGroupSeed for rkyv::rend::u16_le { #[inline(always)] fn into_u32(self) -> u32 { self.to_native() as u32 @@ -39,14 +39,14 @@ impl GroupSeed for rkyv::rend::u16_le { } #[cfg(feature = "rkyv_derive")] -impl GroupSeed for rkyv::rend::u16_be { +impl IntoGroupSeed for rkyv::rend::u16_be { #[inline(always)] fn into_u32(self) -> u32 { self.to_native() as u32 } } -impl GroupSeed for u32 { +impl IntoGroupSeed for u32 { #[inline(always)] fn into_u32(self) -> u32 { self @@ -54,7 +54,7 @@ impl GroupSeed for u32 { } #[cfg(feature = "rkyv_derive")] -impl GroupSeed for rkyv::rend::u32_le { +impl IntoGroupSeed for rkyv::rend::u32_le { #[inline(always)] fn into_u32(self) -> u32 { self.to_native() @@ -62,9 +62,36 @@ impl GroupSeed for rkyv::rend::u32_le { } #[cfg(feature = "rkyv_derive")] -impl GroupSeed for rkyv::rend::u32_be { +impl IntoGroupSeed for rkyv::rend::u32_be { #[inline(always)] fn into_u32(self) -> u32 { self.to_native() } } + +pub trait IntoRankBits: Copy { + fn into_u64(self) -> u64; +} + +impl IntoRankBits for u64 { + #[inline(always)] + fn into_u64(self) -> u64 { + self + } +} + +#[cfg(feature = "rkyv_derive")] +impl IntoRankBits for rkyv::rend::u64_le { + #[inline(always)] + fn into_u64(self) -> u64 { + self.to_native() + } +} + +#[cfg(feature = "rkyv_derive")] +impl IntoRankBits for rkyv::rend::u64_be { + #[inline(always)] + fn into_u64(self) -> u64 { + self.to_native() + } +} diff --git a/src/map.rs b/src/map.rs index 6c9eaf4..71ee297 100644 --- a/src/map.rs +++ b/src/map.rs @@ -18,7 +18,7 @@ use wyhash::WyHash; use crate::{ mphf::{Mphf, MphfError, DEFAULT_GAMMA}, - GroupSeed, + IntoGroupSeed, }; /// An efficient, immutable hash map. @@ -40,7 +40,7 @@ where impl Map where K: Hash, - ST: PrimInt + Unsigned + GroupSeed, + ST: PrimInt + Unsigned + IntoGroupSeed, H: Hasher + Default, { /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. @@ -242,7 +242,7 @@ where K::Archived: PartialEq, V: rkyv::Archive, ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, + ::Archived: IntoGroupSeed, H: Hasher + Default, { /// Checks if the map contains the specified key. diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 748d4b9..3d50743 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -19,7 +19,7 @@ use wyhash::WyHash; use crate::{ mphf::{Mphf, MphfError, DEFAULT_GAMMA}, - GroupSeed, + IntoGroupSeed, }; /// An efficient, immutable hash map with values dictionary-packed for optimized space usage. @@ -46,7 +46,7 @@ impl MapWithDict where K: Hash, V: Eq + Clone + Hash, - ST: PrimInt + Unsigned + GroupSeed, + ST: PrimInt + Unsigned + IntoGroupSeed, H: Hasher + Default, { /// Constructs a `MapWithDict` from an iterator of key-value pairs and MPHF function params. @@ -280,7 +280,7 @@ where K::Archived: PartialEq, V: rkyv::Archive, ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, + ::Archived: IntoGroupSeed, H: Hasher + Default, { /// Checks if the map contains the specified key. diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index 8be629f..8d05b95 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -24,7 +24,7 @@ use wyhash::WyHash; use crate::{ mphf::{Mphf, DEFAULT_GAMMA}, - GroupSeed, + IntoGroupSeed, }; /// An efficient, immutable hash map with bit-packed `Vec` values for optimized space usage. @@ -57,7 +57,7 @@ pub enum Error { impl MapWithDictBitpacked where K: Hash, - ST: PrimInt + Unsigned + GroupSeed, + ST: PrimInt + Unsigned + IntoGroupSeed, H: Hasher + Default, { /// Constructs a `MapWithDictBitpacked` from an iterator of key-value pairs and MPHF function params. @@ -360,7 +360,7 @@ where K: PartialEq + Hash + rkyv::Archive, K::Archived: PartialEq, ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, + ::Archived: IntoGroupSeed, H: Hasher + Default, { /// Updates `values` to the array of values corresponding to the key. Returns `false` if the diff --git a/src/mphf.rs b/src/mphf.rs index f34c6ed..ce8f090 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -22,7 +22,7 @@ use wyhash::WyHash; use crate::{ mphf::MphfError::*, rank::{RankedBits, RankedBitsAccess}, - GroupSeed, + IntoGroupSeed, }; /// A Minimal Perfect Hash Function (MPHF). @@ -75,6 +75,7 @@ impl(keys: &[K], gamma: f32) -> Result { Self::from_iter(keys.iter(), gamma) } @@ -249,7 +250,7 @@ impl(&self, key: &K) -> Option where - ST: GroupSeed + Copy, + ST: IntoGroupSeed, { Self::get_impl( key, @@ -262,7 +263,7 @@ impl( + fn get_impl( key: &K, level_groups: impl Iterator, group_seeds: &[GS], @@ -334,7 +335,7 @@ fn fastmod32(x: u32, n: u32) -> usize { impl ArchivedMphf where ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, + ::Archived: IntoGroupSeed, H: Hasher + Default, { #[inline] diff --git a/src/rank.rs b/src/rank.rs index ae3dd76..27ed994 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -5,6 +5,8 @@ use std::mem::size_of_val; +use crate::IntoRankBits; + /// Size of the L2 block in bits. const L2_BIT_SIZE: usize = 512; /// Size of the L1 block in bits, calculated as a multiple of the L2 block size. @@ -24,10 +26,10 @@ pub trait RankedBitsAccess { /// This method is unsafe because `idx` must be within the bounds of the bits stored in `RankedBitsAccess`. /// An index out of bounds can lead to undefined behavior. #[inline] - unsafe fn rank_impl(bits: &[u64], l12_ranks: &T, idx: usize) -> Option { + unsafe fn rank_impl(bits: &[B], l12_ranks: &T, idx: usize) -> Option { let word_idx = idx / 64; let bit_idx = idx % 64; - let word = *bits.get_unchecked(word_idx); + let word = bits.get_unchecked(word_idx).into_u64(); if (word & (1u64 << bit_idx)) == 0 { return None; @@ -41,9 +43,9 @@ pub trait RankedBitsAccess { let offset = (idx / L2_BIT_SIZE) * 8; let block = bits.get_unchecked(offset..offset + blocks_num); - let block_rank = block.iter().map(|&x| x.count_ones() as usize).sum::(); + let block_rank = block.iter().map(|&x| x.into_u64().count_ones() as usize).sum::(); - let word = *bits.get_unchecked(offset + blocks_num); + let word = bits.get_unchecked(offset + blocks_num).into_u64(); let word_mask = ((1u64 << (idx_within_l2 % 64)) - 1) * (idx_within_l2 > 0) as u64; let word_rank = (word & word_mask).count_ones() as usize; @@ -161,12 +163,7 @@ impl RankedBitsAccess for RankedBits { impl RankedBitsAccess for ArchivedRankedBits { #[inline] fn rank(&self, idx: usize) -> Option { - // todo: transmutes `u64_le` to `u64`. May result in incorrect bits on `be` targets. - // But if `be` user enables rkyv big_endian feature, it will be fixed. So maybe enable the feature for them. - #[allow(clippy::missing_transmute_annotations)] - unsafe { - Self::rank_impl(core::mem::transmute(self.bits.get()), &self.l12_ranks, idx) - } + unsafe { Self::rank_impl(self.bits.get(), &self.l12_ranks, idx) } } } diff --git a/src/set.rs b/src/set.rs index 772e72a..3484883 100644 --- a/src/set.rs +++ b/src/set.rs @@ -22,7 +22,7 @@ use wyhash::WyHash; use crate::{ mphf::{Mphf, MphfError, DEFAULT_GAMMA}, - GroupSeed, + IntoGroupSeed, }; /// An efficient, immutable set. @@ -42,7 +42,7 @@ where impl Set where K: Eq + Hash, - ST: PrimInt + Unsigned + GroupSeed, + ST: PrimInt + Unsigned + IntoGroupSeed, H: Hasher + Default, { /// Constructs a `Set` from an iterator of keys and MPHF function parameters. @@ -180,7 +180,7 @@ where K: Eq + Hash + rkyv::Archive, K::Archived: PartialEq, ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, + ::Archived: IntoGroupSeed, H: Hasher + Default, { /// Returns `true` if the set contains the value. From b03ec103f2296bb010992a23eec0b7d1addff465 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Thu, 23 Oct 2025 13:15:47 +0000 Subject: [PATCH 10/17] Remove unused Map tuple --- src/map_tuple.rs | 499 ----------------------------------------------- 1 file changed, 499 deletions(-) delete mode 100644 src/map_tuple.rs diff --git a/src/map_tuple.rs b/src/map_tuple.rs deleted file mode 100644 index 615aaa9..0000000 --- a/src/map_tuple.rs +++ /dev/null @@ -1,499 +0,0 @@ -//! A module providing `Map`, an immutable hash map implementation. -//! -//! `Map` is a hash map structure that optimizes for space by utilizing a minimal perfect -//! hash function (MPHF) for indexing the map's keys. -//! The MPHF provides direct access to the indices of keys. -//! Keys are stored to ensure that `get` operation will return `None` if key -//! wasn't present in the original set. - -use std::{ - borrow::Borrow, - collections::HashMap, - hash::{BuildHasher, Hash, Hasher}, - mem::size_of_val, -}; - -use num::{PrimInt, Unsigned}; -use wyhash::WyHash; - -use crate::{ - mphf::{Mphf, MphfError, DEFAULT_GAMMA}, - GroupSeed, -}; - -/// An efficient, immutable hash map. -#[derive(Default)] -#[cfg_attr(feature = "rkyv_derive", derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize))] -pub struct MapTuple -where - ST: PrimInt + Unsigned, - H: Hasher + Default, -{ - /// Minimally Perfect Hash Function for keys indices retrieval - mphf: Mphf, - keys_vals: Box<[(K, V)]>, -} - -impl MapTuple -where - K: Hash, - ST: PrimInt + Unsigned + GroupSeed, - H: Hasher + Default, -{ - /// Constructs a `Map` from an iterator of key-value pairs and MPHF function params. - pub fn from_iter_with_params(iter: I, gamma: f32) -> Result - where - I: IntoIterator, - { - let mut keys_vals: Vec<_> = iter.into_iter().collect(); - - let mphf = Mphf::from_iter(keys_vals.iter().map(|(k, _v)| k), gamma)?; - - // Re-order `keys` and `values_index` according to `mphf` - for i in 0..keys_vals.len() { - loop { - let idx = mphf.get(&keys_vals[i].0).unwrap(); - if idx == i { - break; - } - keys_vals.swap(i, idx); - } - } - - Ok(Self { mphf, keys_vals: keys_vals.into_boxed_slice() }) - } - - /// Returns a reference to the value corresponding to the key. Returns `None` if the key is - /// not present in the map. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.get(&1), Some(&2)); - /// assert_eq!(map.get(&5), None); - /// ``` - #[inline] - pub fn get(&self, key: &Q) -> Option<&V> - where - K: Borrow + PartialEq, - Q: Hash + Eq + ?Sized, - { - let idx = self.mphf.get(key)?; - - // SAFETY: `idx` is always within bounds (ensured during construction) - unsafe { - let (k, v) = self.keys_vals.get_unchecked(idx); - if k == key { - Some(v) - } else { - None - } - } - } - - /// Returns the number of key-value pairs in the map. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.len(), 2); - /// ``` - #[inline] - pub fn len(&self) -> usize { - self.keys_vals.len() - } - - /// Returns `true` if the map contains no elements. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(0, 0); 0])).unwrap(); - /// assert_eq!(map.is_empty(), true); - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.is_empty(), false); - /// ``` - #[inline] - pub fn is_empty(&self) -> bool { - self.keys_vals.is_empty() - } - - /// Checks if the map contains the specified key. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.contains_key(&1), true); - /// assert_eq!(map.contains_key(&2), false); - /// ``` - #[inline] - pub fn contains_key(&self, key: &Q) -> bool - where - K: Borrow + PartialEq, - Q: Hash + Eq + ?Sized, - { - if let Some(idx) = self.mphf.get(key) { - // SAFETY: `idx` is always within bounds (ensured during construction) - unsafe { &self.keys_vals.get_unchecked(idx).0 == key } - } else { - false - } - } - - /// Returns an iterator over the map, yielding key-value pairs. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// for (key, val) in map.iter() { - /// println!("key: {key} val: {val}"); - /// } - /// ``` - #[inline] - pub fn iter(&self) -> impl Iterator { - self.keys_vals.iter() - } - - /// Returns an iterator over the keys of the map. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// for key in map.keys() { - /// println!("{key}"); - /// } - /// ``` - #[inline] - pub fn keys(&self) -> impl Iterator { - self.keys_vals.iter().map(|(k, _v)| k) - } - - /// Returns an iterator over the values of the map. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// for val in map.values() { - /// println!("{val}"); - /// } - /// ``` - #[inline] - pub fn values(&self) -> impl Iterator { - self.keys_vals.iter().map(|(_k, v)| v) - } - - /// Returns the total number of bytes occupied by the structure. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.size(), 222); - /// ``` - #[inline] - pub fn size(&self) -> usize { - size_of_val(self) + self.mphf.size() + size_of_val(self.keys_vals.as_ref()) - } -} - -/// Creates a `Map` from a `HashMap`. -impl TryFrom> for MapTuple -where - K: Eq + Hash + Clone, - V: Eq + Clone + Hash, - B: BuildHasher, -{ - type Error = MphfError; - - #[inline] - fn try_from(value: HashMap) -> Result { - Self::from_iter_with_params(value, DEFAULT_GAMMA) - } -} - -/// Implement `get` for `Archived` version of `Map` if feature is enabled -#[cfg(feature = "rkyv_derive")] -impl ArchivedMapTuple -where - K: PartialEq + Hash + rkyv::Archive, - K::Archived: PartialEq, - V: rkyv::Archive, - ST: PrimInt + Unsigned + rkyv::Archive, - ::Archived: GroupSeed + Copy, - H: Hasher + Default, -{ - /// Checks if the map contains the specified key. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::ArchivedMapTuple; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// let bytes = rkyv::to_bytes::(&map).unwrap(); - /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); - /// assert_eq!(archived_map.contains_key(&1), true); - /// assert_eq!(archived_map.contains_key(&2), false); - /// ``` - #[inline] - pub fn contains_key(&self, key: &Q) -> bool - where - K: Borrow, - ::Archived: PartialEq, - Q: Hash + Eq + ?Sized, - { - if let Some(idx) = self.mphf.get(key) { - // SAFETY: `idx` is always within bounds (ensured during construction) - let rkyv::tuple::ArchivedTuple2(k, _v) = unsafe { self.keys_vals.get_unchecked(idx) }; - - k == key - } else { - false - } - } - - /// Returns a reference to the value corresponding to the key. Returns `None` if the key is - /// not present in the map. - /// - /// # Examples - /// ``` - /// # use std::collections::HashMap; - /// # use entropy_map::ArchivedMapTuple; - /// # use entropy_map::Map; - /// let map = MapTuple::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// let bytes = rkyv::to_bytes::(&map).unwrap(); - /// let archived_map = rkyv::access::, rkyv::rancor::Error>(&bytes).unwrap(); - /// assert_eq!(archived_map.get(&1).map(|v| v.to_native()), Some(2)); - /// assert_eq!(archived_map.get(&5).map(|v| v.to_native()), None); - /// ``` - #[inline] - pub fn get(&self, key: &Q) -> Option<&V::Archived> - where - K: Borrow, - ::Archived: PartialEq, - Q: Hash + Eq + ?Sized, - { - let idx = self.mphf.get(key)?; - - // SAFETY: `idx` is always within bounds (ensured during construction) - unsafe { - let rkyv::tuple::ArchivedTuple2(k, v) = self.keys_vals.get_unchecked(idx); - if k == key { - Some(v) - } else { - None - } - } - } - - /// Returns an iterator over the archived map, yielding archived key-value pairs. - #[inline] - pub fn iter(&self) -> impl Iterator::Archived> { - self.keys_vals.iter() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use paste::paste; - use proptest::prelude::*; - use rand::{Rng, SeedableRng}; - use rand_chacha::ChaCha8Rng; - use std::collections::{hash_map::RandomState, HashSet}; - - fn gen_map(items_num: usize) -> HashMap { - let mut rng = ChaCha8Rng::seed_from_u64(123); - - (0..items_num) - .map(|_| { - let key = rng.gen::(); - let value = rng.gen_range(1..=10); - (key, value) - }) - .collect() - } - - #[test] - fn test_map_with_dict() { - // Collect original key-value pairs directly into a HashMap - let original_map = gen_map(1000); - - // Create the map from the iterator - let map = MapTuple::try_from(original_map.clone()).unwrap(); - - // Test len - assert_eq!(map.len(), original_map.len()); - - // Test is_empty - assert_eq!(map.is_empty(), original_map.is_empty()); - - // Test get, contains_key - for (key, value) in &original_map { - assert_eq!(map.get(key), Some(value)); - assert!(map.contains_key(key)); - } - - // Test iter - for (k, v) in map.iter() { - assert_eq!(original_map.get(k), Some(v)); - } - - // Test keys - for k in map.keys() { - assert!(original_map.contains_key(k)); - } - - // Test values - for &v in map.values() { - assert!(original_map.values().any(|&val| val == v)); - } - - // Test size - assert_eq!(map.size(), 16554); - } - - /// Assert that we can call `.get()` with `K::borrow()`. - #[test] - fn test_get_borrow() { - let original_map: HashMap = - HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); - let map = MapTuple::try_from(original_map).unwrap(); - - assert_eq!(map.get("a"), Some(&())); - assert!(map.contains_key("a")); - assert_eq!(map.get("b"), Some(&())); - assert!(map.contains_key("b")); - assert_eq!(map.get("c"), None); - assert!(!map.contains_key("c")); - } - - #[cfg(feature = "rkyv_derive")] - #[test] - fn test_rkyv() { - // create regular `HashMap`, then `Map`, then serialize to `rkyv` bytes. - let original_map = gen_map(1000); - let map = MapTuple::try_from(original_map.clone()).unwrap(); - let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - - assert_eq!(rkyv_bytes.len(), 16424); - - let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - - // Test get on `Archived` version - for (k, v) in original_map.iter() { - assert_eq!(v, rkyv_map.get(k).unwrap()); - } - - // Test iter on `Archived` version - for rkyv::tuple::ArchivedTuple2(k, v) in rkyv_map.iter() { - assert_eq!(original_map.get(&k.to_native()), Some(&v.to_native())); - } - } - - #[cfg(feature = "rkyv_derive")] - #[test] - fn test_rkyv_get_borrow() { - let original_map: HashMap = - HashMap::from_iter([("a".to_string(), ()), ("b".to_string(), ())]); - let map = MapTuple::try_from(original_map).unwrap(); - let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); - - assert_eq!(map.get("a"), Some(&())); - assert!(rkyv_map.contains_key("a")); - assert_eq!(map.get("b"), Some(&())); - assert!(rkyv_map.contains_key("b")); - assert_eq!(map.get("c"), None); - assert!(!rkyv_map.contains_key("c")); - } - - macro_rules! proptest_map_with_dict_model { - ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => { - $( - paste! { - proptest! { - #[test] - fn [](model: HashMap, arbitrary: HashSet) { - let entropy_map: MapTuple = MapTuple::from_iter_with_params( - model.clone(), - $gamma as f32 / 100.0 - ).unwrap(); - - // Assert that length matches model. - assert_eq!(entropy_map.len(), model.len()); - assert_eq!(entropy_map.is_empty(), model.is_empty()); - - // Assert that keys and values match model. - assert_eq!( - HashSet::<_, RandomState>::from_iter(entropy_map.keys()), - HashSet::from_iter(model.keys()) - ); - assert_eq!( - HashSet::<_, RandomState>::from_iter(entropy_map.values()), - HashSet::from_iter(model.values()) - ); - - // Assert that contains and get operations match model for contained elements. - for (k, v) in &model { - assert!(entropy_map.contains_key(&k)); - assert_eq!(entropy_map.get(&k), Some(v)); - } - - // Assert that contains and get operations match model for random elements. - for k in arbitrary { - assert_eq!( - model.contains_key(&k), - entropy_map.contains_key(&k), - ); - assert_eq!(entropy_map.get(&k), model.get(&k)); - } - } - } - } - )* - }; - } - - proptest_map_with_dict_model!( - // (1, 8, 100), - (2, 8, 100), - (4, 8, 100), - (7, 8, 100), - (8, 8, 100), - (15, 8, 100), - (16, 8, 100), - (23, 8, 100), - (24, 8, 100), - (31, 8, 100), - (32, 8, 100), - (33, 8, 100), - (48, 8, 100), - (53, 8, 100), - (61, 8, 100), - (63, 8, 100), - (64, 8, 100), - (32, 7, 100), - (32, 5, 100), - (32, 4, 100), - (32, 3, 100), - (32, 1, 100), - (32, 0, 100), - (32, 8, 200), - (32, 6, 200), - ); -} From 2941f6c2f59b1406f1ecf86817bdee0d02fbcfcb Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Thu, 23 Oct 2025 13:19:38 +0000 Subject: [PATCH 11/17] Rename Map tests --- src/map.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/map.rs b/src/map.rs index 71ee297..3241f28 100644 --- a/src/map.rs +++ b/src/map.rs @@ -335,7 +335,7 @@ mod tests { } #[test] - fn test_map_with_dict() { + fn test_map() { // Collect original key-value pairs directly into a HashMap let original_map = gen_map(1000); @@ -428,13 +428,13 @@ mod tests { assert!(!rkyv_map.contains_key("c")); } - macro_rules! proptest_map_with_dict_model { + macro_rules! proptest_map_model { ($(($b:expr, $s:expr, $gamma:expr)),* $(,)?) => { $( paste! { proptest! { #[test] - fn [](model: HashMap, arbitrary: HashSet) { + fn [](model: HashMap, arbitrary: HashSet) { let entropy_map: Map = Map::from_iter_with_params( model.clone(), $gamma as f32 / 100.0 @@ -475,7 +475,7 @@ mod tests { }; } - proptest_map_with_dict_model!( + proptest_map_model!( // (1, 8, 100), (2, 8, 100), (4, 8, 100), From 5eab5466c2e022525df52bea74cc98cb11a864de Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Thu, 23 Oct 2025 13:21:39 +0000 Subject: [PATCH 12/17] Comment --- src/map.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/map.rs b/src/map.rs index 3241f28..9fc24a8 100644 --- a/src/map.rs +++ b/src/map.rs @@ -35,6 +35,7 @@ where keys: Box<[K]>, /// Map values values: Box<[V]>, + // storage as tuple `Box<[(K, V)]>` works slower } impl Map From b458bd4297ab3360d91641092947d3f51aec3daa Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Sat, 25 Oct 2025 20:24:31 +0000 Subject: [PATCH 13/17] Fix try_from --- src/map.rs | 3 +-- src/map_with_dict.rs | 2 +- src/map_with_dict_bitpacked.rs | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/map.rs b/src/map.rs index 9fc24a8..3e095e8 100644 --- a/src/map.rs +++ b/src/map.rs @@ -223,8 +223,7 @@ where /// Creates a `Map` from a `HashMap`. impl TryFrom> for Map where - K: Eq + Hash + Clone, - V: Eq + Clone + Hash, + K: Hash, B: BuildHasher, { type Error = MphfError; diff --git a/src/map_with_dict.rs b/src/map_with_dict.rs index 3d50743..637e0b1 100644 --- a/src/map_with_dict.rs +++ b/src/map_with_dict.rs @@ -260,7 +260,7 @@ where /// Creates a `MapWithDict` from a `HashMap`. impl TryFrom> for MapWithDict where - K: Eq + Hash + Clone, + K: Hash, V: Eq + Clone + Hash, B: BuildHasher, { diff --git a/src/map_with_dict_bitpacked.rs b/src/map_with_dict_bitpacked.rs index 8d05b95..d74eb48 100644 --- a/src/map_with_dict_bitpacked.rs +++ b/src/map_with_dict_bitpacked.rs @@ -293,7 +293,7 @@ where /// Creates a `MapWithDictBitpacked` from a `HashMap`. impl TryFrom, B>> for MapWithDictBitpacked where - K: PartialEq + Hash + Clone, + K: Hash, B: BuildHasher, { type Error = Error; From bce4980d90be540e5001779ee59751e1d8aa153e Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Sun, 26 Oct 2025 17:21:22 +0000 Subject: [PATCH 14/17] MphfError impl Error --- src/mphf.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/mphf.rs b/src/mphf.rs index ce8f090..8f99d12 100644 --- a/src/mphf.rs +++ b/src/mphf.rs @@ -10,6 +10,7 @@ //! but prioritizes code simplicity and portability, with a special focus on optimizing the rank //! storage mechanism and reducing the construction time and querying latency of MPHF. +use core::fmt; use std::{ hash::{Hash, Hasher}, marker::PhantomData, @@ -51,14 +52,26 @@ const MAX_LEVELS: usize = 64; /// Errors that can occur when initializing `Mphf`. #[derive(Debug)] pub enum MphfError { - /// Error when the maximum number of levels is exceeded during initialization. - MaxLevelsExceeded, - /// Error when the seed type `ST` is too small to store `S` bits - InvalidSeedType, /// Error when the `gamma` parameter is less than 1.0. InvalidGammaParameter, + /// Error when the seed type `ST` is too small to store `S` bits + InvalidSeedType, + /// Error when the maximum number of levels is exceeded during initialization. + MaxLevelsExceeded, +} + +impl fmt::Display for MphfError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidGammaParameter => write!(f, "the `gamma` parameter is less than 1.0"), + Self::InvalidSeedType => write!(f, "the seed type `ST` is too small to store `S` bits"), + Self::MaxLevelsExceeded => write!(f, "the maximum number of levels is exceeded during initialization"), + } + } } +impl std::error::Error for MphfError {} + /// Default `gamma` parameter for MPHF. pub const DEFAULT_GAMMA: f32 = 2.0; From 1a507ae4c4f2f53340bcef6e2de8a06048f986f5 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Mon, 27 Oct 2025 00:06:58 +0000 Subject: [PATCH 15/17] ArchivedMap missing methods --- src/map.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/map.rs b/src/map.rs index 3e095e8..ad4f95a 100644 --- a/src/map.rs +++ b/src/map.rs @@ -311,6 +311,18 @@ where pub fn iter(&self) -> impl Iterator { self.keys.iter().zip(self.values.iter()) } + + /// Returns the number of key-value pairs in the map. + #[inline] + pub fn len(&self) -> usize { + self.keys.len() + } + + /// Returns `true` if the map contains no elements. + #[inline] + pub fn is_empty(&self) -> bool { + self.keys.is_empty() + } } #[cfg(test)] From 38ff8a51c478e014c3761c9f5262100a938a7943 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Mon, 27 Oct 2025 13:09:14 +0000 Subject: [PATCH 16/17] Bench ArchivedHashMap --- benches/map.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/benches/map.rs b/benches/map.rs index fbbda2f..354eb6f 100644 --- a/benches/map.rs +++ b/benches/map.rs @@ -17,6 +17,10 @@ use rand_chacha::ChaCha8Rng; /// Map/entropy get /// time: [29.678 ms 29.985 ms 30.320 ms] /// thrpt: [32.982 Melem/s 33.350 Melem/s 33.695 Melem/s] +/// +/// Map/HashMap archived get +/// time: [37.152 ms 37.373 ms 37.712 ms] +/// thrpt: [26.517 Melem/s 26.757 Melem/s 26.917 Melem/s] /// /// Map rkyv serialization took: 1.474797ms /// @@ -63,6 +67,21 @@ pub fn benchmark(c: &mut Criterion) { }); }); + let rkyv_bytes = rkyv::to_bytes::(&hash_map).unwrap(); + let rkyv_hash_map = rkyv::access::< + rkyv::collections::swiss_table::map::ArchivedHashMap, + rkyv::rancor::Error, + >(&rkyv_bytes) + .unwrap(); + + group.bench_function("HashMap archived get", |b| { + b.iter(|| { + for key in original_map.keys().take(query_n) { + black_box(rkyv_hash_map.get(key).unwrap()); + } + }); + }); + let t0 = Instant::now(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); println!("Map rkyv serialization took: {:?}", t0.elapsed()); From e41434ec6dc81b4616a71146011e82706428eca3 Mon Sep 17 00:00:00 2001 From: RoDmitry Date: Mon, 27 Oct 2025 14:34:57 +0000 Subject: [PATCH 17/17] Map tuple k-v storage --- benches/map.rs | 12 ++++---- src/map.rs | 75 +++++++++++++++++++++++--------------------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/benches/map.rs b/benches/map.rs index 354eb6f..30e78d2 100644 --- a/benches/map.rs +++ b/benches/map.rs @@ -8,25 +8,25 @@ use rand_chacha::ChaCha8Rng; /// Benchmark results for N = 1M: /// -/// Map construction took: 14.06023608s +/// Map construction took: 13.64868523s /// /// Map/HashMap get /// time: [18.350 ms 18.518 ms 18.707 ms] /// thrpt: [53.455 Melem/s 54.001 Melem/s 54.496 Melem/s] /// /// Map/entropy get -/// time: [29.678 ms 29.985 ms 30.320 ms] -/// thrpt: [32.982 Melem/s 33.350 Melem/s 33.695 Melem/s] +/// time: [37.033 ms 37.293 ms 37.613 ms] +/// thrpt: [26.587 Melem/s 26.815 Melem/s 27.003 Melem/s] /// /// Map/HashMap archived get /// time: [37.152 ms 37.373 ms 37.712 ms] /// thrpt: [26.517 Melem/s 26.757 Melem/s 26.917 Melem/s] /// -/// Map rkyv serialization took: 1.474797ms +/// Map rkyv serialization took: 4.447392ms /// /// Map/entropy archived get -/// time: [25.789 ms 26.182 ms 26.617 ms] -/// thrpt: [37.570 Melem/s 38.194 Melem/s 38.776 Melem/s] +/// time: [40.613 ms 41.039 ms 41.563 ms] +/// thrpt: [24.060 Melem/s 24.367 Melem/s 24.623 Melem/s] pub fn benchmark(c: &mut Criterion) { let n: usize = env::var("N").unwrap_or("1000000".to_string()).parse().unwrap(); let query_n: usize = env::var("QN").unwrap_or("1000000".to_string()).parse().unwrap(); diff --git a/src/map.rs b/src/map.rs index ad4f95a..d1c1a0a 100644 --- a/src/map.rs +++ b/src/map.rs @@ -31,11 +31,7 @@ where { /// Minimally Perfect Hash Function for keys indices retrieval mphf: Mphf, - /// Map keys - keys: Box<[K]>, - /// Map values - values: Box<[V]>, - // storage as tuple `Box<[(K, V)]>` works slower + keys_vals: Box<[(K, V)]>, } impl Map @@ -49,29 +45,22 @@ where where I: IntoIterator, { - let mut keys = vec![]; - let mut values = vec![]; + let mut keys_vals: Vec<_> = iter.into_iter().collect(); - for (k, v) in iter { - keys.push(k); - values.push(v); - } - - let mphf = Mphf::from_slice(&keys, gamma)?; + let mphf = Mphf::from_iter(keys_vals.iter().map(|(k, _v)| k), gamma)?; // Re-order `keys` and `values_index` according to `mphf` - for i in 0..keys.len() { + for i in 0..keys_vals.len() { loop { - let idx = mphf.get(&keys[i]).unwrap(); + let idx = mphf.get(&keys_vals[i].0).unwrap(); if idx == i { break; } - keys.swap(i, idx); - values.swap(i, idx); + keys_vals.swap(i, idx); } } - Ok(Self { mphf, keys: keys.into_boxed_slice(), values: values.into_boxed_slice() }) + Ok(Self { mphf, keys_vals: keys_vals.into_boxed_slice() }) } /// Returns a reference to the value corresponding to the key. Returns `None` if the key is @@ -95,8 +84,9 @@ where // SAFETY: `idx` is always within bounds (ensured during construction) unsafe { - if self.keys.get_unchecked(idx) == key { - Some(self.values.get_unchecked(idx)) + let (k, v) = self.keys_vals.get_unchecked(idx); + if k == key { + Some(v) } else { None } @@ -114,7 +104,7 @@ where /// ``` #[inline] pub fn len(&self) -> usize { - self.keys.len() + self.keys_vals.len() } /// Returns `true` if the map contains no elements. @@ -130,7 +120,7 @@ where /// ``` #[inline] pub fn is_empty(&self) -> bool { - self.keys.is_empty() + self.keys_vals.is_empty() } /// Checks if the map contains the specified key. @@ -151,7 +141,7 @@ where { if let Some(idx) = self.mphf.get(key) { // SAFETY: `idx` is always within bounds (ensured during construction) - unsafe { self.keys.get_unchecked(idx) == key } + unsafe { &self.keys_vals.get_unchecked(idx).0 == key } } else { false } @@ -169,8 +159,8 @@ where /// } /// ``` #[inline] - pub fn iter(&self) -> impl Iterator { - self.keys.iter().zip(self.values.iter()) + pub fn iter(&self) -> impl Iterator { + self.keys_vals.iter() } /// Returns an iterator over the keys of the map. @@ -186,7 +176,7 @@ where /// ``` #[inline] pub fn keys(&self) -> impl Iterator { - self.keys.iter() + self.keys_vals.iter().map(|(k, _v)| k) } /// Returns an iterator over the values of the map. @@ -202,7 +192,7 @@ where /// ``` #[inline] pub fn values(&self) -> impl Iterator { - self.values.iter() + self.keys_vals.iter().map(|(_k, v)| v) } /// Returns the total number of bytes occupied by the structure. @@ -212,11 +202,11 @@ where /// # use std::collections::HashMap; /// # use entropy_map::Map; /// let map = Map::try_from(HashMap::from([(1, 2), (3, 4)])).unwrap(); - /// assert_eq!(map.size(), 238); + /// assert_eq!(map.size(), 222); /// ``` #[inline] pub fn size(&self) -> usize { - size_of_val(self) + self.mphf.size() + size_of_val(self.keys.as_ref()) + size_of_val(self.values.as_ref()) + size_of_val(self) + self.mphf.size() + size_of_val(self.keys_vals.as_ref()) } } @@ -267,7 +257,9 @@ where { if let Some(idx) = self.mphf.get(key) { // SAFETY: `idx` is always within bounds (ensured during construction) - unsafe { self.keys.get_unchecked(idx) == key } + let rkyv::tuple::ArchivedTuple2(k, _v) = unsafe { self.keys_vals.get_unchecked(idx) }; + + k == key } else { false } @@ -298,8 +290,9 @@ where // SAFETY: `idx` is always within bounds (ensured during construction) unsafe { - if self.keys.get_unchecked(idx) == key { - Some(self.values.get_unchecked(idx)) + let rkyv::tuple::ArchivedTuple2(k, v) = self.keys_vals.get_unchecked(idx); + if k == key { + Some(v) } else { None } @@ -308,20 +301,20 @@ where /// Returns an iterator over the archived map, yielding archived key-value pairs. #[inline] - pub fn iter(&self) -> impl Iterator { - self.keys.iter().zip(self.values.iter()) + pub fn iter(&self) -> impl Iterator::Archived> { + self.keys_vals.iter() } /// Returns the number of key-value pairs in the map. #[inline] pub fn len(&self) -> usize { - self.keys.len() + self.keys_vals.len() } /// Returns `true` if the map contains no elements. #[inline] pub fn is_empty(&self) -> bool { - self.keys.is_empty() + self.keys_vals.is_empty() } } @@ -367,8 +360,8 @@ mod tests { } // Test iter - for (&k, &v) in map.iter() { - assert_eq!(original_map.get(&k), Some(&v)); + for (k, v) in map.iter() { + assert_eq!(original_map.get(k), Some(v)); } // Test keys @@ -382,7 +375,7 @@ mod tests { } // Test size - assert_eq!(map.size(), 12546); + assert_eq!(map.size(), 16530); } /// Assert that we can call `.get()` with `K::borrow()`. @@ -408,7 +401,7 @@ mod tests { let map: Map = Map::from_iter_with_params(original_map.clone(), DEFAULT_GAMMA).unwrap(); let rkyv_bytes = rkyv::to_bytes::(&map).unwrap(); - assert_eq!(rkyv_bytes.len(), 12408); + assert_eq!(rkyv_bytes.len(), 16424); let rkyv_map = rkyv::access::, rkyv::rancor::Error>(&rkyv_bytes).unwrap(); @@ -418,7 +411,7 @@ mod tests { } // Test iter on `Archived` version - for (k, v) in rkyv_map.iter() { + for rkyv::tuple::ArchivedTuple2(k, v) in rkyv_map.iter() { assert_eq!(original_map.get(&k.to_native()), Some(&v.to_native())); } }