-
Notifications
You must be signed in to change notification settings - Fork 20
chore: fine tune frequencies and theta sketches #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,7 +35,7 @@ pub(super) struct ReversePurgeItemHashMap<T> { | |
| lg_length: u8, | ||
| load_threshold: usize, | ||
| keys: Vec<Option<T>>, | ||
| values: Vec<i64>, | ||
| values: Vec<u64>, | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All offsets and counts are non-negative. Use |
||
| states: Vec<u16>, | ||
| num_active: usize, | ||
| } | ||
|
|
@@ -59,7 +59,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| } | ||
|
|
||
| /// Returns the value for `key`, or zero if the key is not present. | ||
| pub fn get(&self, key: &T) -> i64 { | ||
| pub fn get(&self, key: &T) -> u64 { | ||
| let probe = self.hash_probe(key); | ||
| if self.states[probe] > 0 { | ||
| return self.values[probe]; | ||
|
|
@@ -68,7 +68,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| } | ||
|
|
||
| /// Adds `adjust_amount` to the value for `key`, inserting if absent. | ||
| pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: i64) { | ||
| pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: u64) { | ||
| let mask = self.keys.len() - 1; | ||
| let mut probe = (hash_item(&key) as usize) & mask; | ||
| let mut drift: usize = 1; | ||
|
|
@@ -95,20 +95,20 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| } | ||
|
|
||
| /// Removes all keys with non-positive counts. | ||
| pub fn keep_only_positive_counts(&mut self) { | ||
| fn keep_only_positive_counts(&mut self) { | ||
| let len = self.keys.len(); | ||
| let mut first_probe = len - 1; | ||
| while self.states[first_probe] > 0 { | ||
| first_probe -= 1; | ||
| } | ||
| for probe in (0..first_probe).rev() { | ||
| if self.states[probe] > 0 && self.values[probe] <= 0 { | ||
| if self.states[probe] > 0 && self.values[probe] == 0 { | ||
| self.hash_delete(probe); | ||
| self.num_active -= 1; | ||
| } | ||
| } | ||
| for probe in (first_probe..len).rev() { | ||
| if self.states[probe] > 0 && self.values[probe] <= 0 { | ||
| if self.states[probe] > 0 && self.values[probe] == 0 { | ||
| self.hash_delete(probe); | ||
| self.num_active -= 1; | ||
| } | ||
|
|
@@ -118,16 +118,16 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| /// Shifts all values by `adjust_amount`. | ||
| /// | ||
| /// This is used during purges to decrement counters. | ||
| pub fn adjust_all_values_by(&mut self, adjust_amount: i64) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Revoke |
||
| for value in &mut self.values { | ||
| *value += adjust_amount; | ||
| fn adjust_all_values_by(&mut self, adjust_amount: u64) { | ||
| for value in self.values.iter_mut() { | ||
| *value = value.saturating_sub(adjust_amount); | ||
| } | ||
| } | ||
|
|
||
| /// Purges the map by estimating the median count and removing non-positive entries. | ||
| /// | ||
| /// Returns the estimated median value that was subtracted from all counts. | ||
| pub fn purge(&mut self, sample_size: usize) -> i64 { | ||
| pub fn purge(&mut self, sample_size: usize) -> u64 { | ||
| let limit = sample_size.min(self.num_active).min(MAX_SAMPLE_SIZE); | ||
| let mut samples = Vec::with_capacity(limit); | ||
| let mut i = 0usize; | ||
|
|
@@ -140,7 +140,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| let mid = samples.len() / 2; | ||
| samples.select_nth_unstable(mid); | ||
| let median = samples[mid]; | ||
| self.adjust_all_values_by(-median); | ||
| self.adjust_all_values_by(median); | ||
| self.keep_only_positive_counts(); | ||
| median | ||
| } | ||
|
|
@@ -206,7 +206,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> { | |
| } | ||
|
|
||
| /// Returns the active values in the map. | ||
| pub fn active_values(&self) -> Vec<i64> { | ||
| pub fn active_values(&self) -> Vec<u64> { | ||
| if self.num_active == 0 { | ||
| return Vec::new(); | ||
| } | ||
|
|
@@ -292,7 +292,7 @@ impl<'a, T> ReversePurgeItemIter<'a, T> { | |
| } | ||
|
|
||
| impl<'a, T> Iterator for ReversePurgeItemIter<'a, T> { | ||
| type Item = (&'a T, i64); | ||
| type Item = (&'a T, u64); | ||
|
|
||
| fn next(&mut self) -> Option<Self::Item> { | ||
| if self.count >= self.map.num_active { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,3 +39,6 @@ pub mod theta; | |
|
|
||
| mod codec; | ||
| mod hash; | ||
| mod resize; | ||
|
|
||
| pub use self::resize::ResizeFactor; | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To @PsiACE,
No need
ItemsSerdeif we already implement serde over concreteFrequentItemsSketch<i64>andFrequentItemsSketch<String>.But the C++ impl does have a Serde abstraction you may investigate in https://github.com/apache/datasketches-cpp/blob/7bb979d3/common/include/serde.hpp
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @AlexanderSaydakov I'd appreciate it if you can share the story and usage of
serde.hpp.