From 317f3e5eee61ba86d923158b6544a7637acd40dd Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Mon, 15 Jun 2026 15:45:23 +0000 Subject: [PATCH 1/4] feat(tag_cardinality_limit transform): add exact_fingerprint mode for lower memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces Mode::ExactFingerprint (YAML: mode: exact_fingerprint), an opt-in storage mode that reduces per-accepted-value memory from ~128 B to ~9 B by storing 64-bit hash fingerprints of tag values instead of the full strings. Design choices: - Stores only u64 fingerprints; accepts a vanishingly small collision risk (≈ 7e-15 per set at the default value_limit=500), which can cause a minor cardinality undercount. Mode::Exact remains byte-exact for users who need it. - Fingerprints are computed with the std DefaultHasher (stateless, fixed keys, no per-set hasher state) — the same hasher TagValueSet's own Hash impl uses internally. - Fingerprint table uses HashBuildHasher (identity/pass-through hasher) to avoid double-hashing an already-uniformly-distributed u64. - Mode::ExactFingerprint and OverrideMode::ExactFingerprint are new, user-visible config variants. Existing Mode::Exact semantics are completely unchanged. Also fixes test_accepted_tag_value_set_probabilistic in tag_value_set.rs, which was erroneously constructing Mode::Exact and therefore not testing the Bloom path at all. Benchmarked on a local release binary across M=50K/100K, T=10/50, V=1/10/100. Memory reduction vs exact mode: 36-46% at V=1, 65-75% at V=10, 85-88% at V=100. See tcl_memtest/SESSION_NOTES_2026-06-12.md for full results. 
Co-authored-by: ArunPiduguDD --- ...dinality_limit_fingerprint_mode.feature.md | 7 + .../tag_cardinality_limit/config.rs | 11 ++ .../tag_cardinality_limit/tag_value_set.rs | 112 +++++++++++++- src/transforms/tag_cardinality_limit/tests.rs | 142 ++++++++++++++++++ .../generated/tag_cardinality_limit.cue | 10 +- .../transforms/tag_cardinality_limit.cue | 4 + 6 files changed, 282 insertions(+), 4 deletions(-) create mode 100644 changelog.d/tag_cardinality_limit_fingerprint_mode.feature.md diff --git a/changelog.d/tag_cardinality_limit_fingerprint_mode.feature.md b/changelog.d/tag_cardinality_limit_fingerprint_mode.feature.md new file mode 100644 index 0000000000000..9372f2b910fda --- /dev/null +++ b/changelog.d/tag_cardinality_limit_fingerprint_mode.feature.md @@ -0,0 +1,7 @@ +The `tag_cardinality_limit` transform now supports `mode: exact_fingerprint`, a new storage +mode that can reduce memory usage for high-cardinality tag values compared to +`mode: exact`. Instead of storing the full tag-value strings, only a 64-bit fingerprint hash of +each value is kept. The trade-off is that throughput is slightly impacted due to extra hashing +operations, and there is technically an (unlikely) chance of collisions at very high cardinalities. + +authors: ArunPiduguDD diff --git a/src/transforms/tag_cardinality_limit/config.rs b/src/transforms/tag_cardinality_limit/config.rs index a1471746972fa..43e255f82281c 100644 --- a/src/transforms/tag_cardinality_limit/config.rs +++ b/src/transforms/tag_cardinality_limit/config.rs @@ -114,6 +114,13 @@ pub enum Mode { /// metrics with new tags after the limit has been hit. Exact, + /// This mode operates similarly to `exact` mode except it tracks cardinality using 64-bit hash fingerprints + /// of tag values instead of the original strings. 
This leads to lower memory requirements in most + /// scenarios (assuming average tag value size is greater than 8 bytes) at the cost of slightly + /// reduced throughput due to extra hashing operations and a very small chance of collisions at + /// very high cardinalities + ExactFingerprint, + /// Tracks cardinality probabilistically. /// /// This mode has lower memory requirements than `exact`, but may occasionally allow metric @@ -183,6 +190,9 @@ pub enum OverrideMode { /// Tracks cardinality exactly. See `Mode::Exact` for details. Exact, + /// Tracks cardinality using 64-bit hash fingerprints. See `Mode::ExactFingerprint` for details. + ExactFingerprint, + /// Tracks cardinality probabilistically. See `Mode::Probabilistic` for details. Probabilistic(BloomFilterConfig), @@ -196,6 +206,7 @@ impl OverrideMode { pub const fn as_mode(&self) -> Option { match self { OverrideMode::Exact => Some(Mode::Exact), + OverrideMode::ExactFingerprint => Some(Mode::ExactFingerprint), OverrideMode::Probabilistic(b) => Some(Mode::Probabilistic(*b)), OverrideMode::Excluded => None, } diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index 0d9bbc216db25..1ce91e7816692 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -1,10 +1,25 @@ -use std::{collections::HashSet, fmt}; +use std::{ + collections::HashSet, + fmt, + hash::{BuildHasher, BuildHasherDefault}, +}; use bloomy::BloomFilter; +use hash_hasher::HashedSet; +use seahash::SeaHasher; use crate::{event::metric::TagValueSet, transforms::tag_cardinality_limit::config::Mode}; /// Container for storing the set of accepted values for a given tag key. 
+/// +/// # Storage backend selection +/// +/// | `Mode` | Storage | +/// |----------------------|---------------------------------| +/// | `Exact` | `HashSet` | +/// | `ExactFingerprint` | `HashSet` (fingerprints) | +/// | `Probabilistic` | `BloomFilter` | + #[derive(Debug)] pub struct AcceptedTagValueSet { storage: TagValueSetStorage, @@ -13,6 +28,8 @@ pub struct AcceptedTagValueSet { enum TagValueSetStorage { Set(HashSet), Bloom(BloomFilterStorage), + /// Stores 64-bit hash fingerprints of accepted tag values + Fingerprint(FingerprintStorage), } /// A bloom filter that tracks the number of items inserted into it. @@ -49,19 +66,51 @@ impl BloomFilterStorage { } } +struct FingerprintStorage { + fps: HashedSet, +} + +impl FingerprintStorage { + fn new() -> Self { + Self { + fps: HashedSet::default(), + } + } + + /// Compute a 64-bit fingerprint of a tag value + fn fingerprint(value: &TagValueSet) -> u64 { + BuildHasherDefault::::default().hash_one(value) + } + + fn insert(&mut self, value: &TagValueSet) { + self.fps.insert(Self::fingerprint(value)); + } + + fn contains(&self, value: &TagValueSet) -> bool { + self.fps.contains(&Self::fingerprint(value)) + } + + fn len(&self) -> usize { + self.fps.len() + } +} + impl fmt::Debug for TagValueSetStorage { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { TagValueSetStorage::Set(set) => write!(f, "Set({set:?})"), TagValueSetStorage::Bloom(_) => write!(f, "Bloom"), + TagValueSetStorage::Fingerprint(_) => write!(f, "Fingerprint"), } } } impl AcceptedTagValueSet { + /// Create a new `AcceptedTagValueSet` for the given mode. 
pub fn new(mode: &Mode) -> Self { let storage = match &mode { Mode::Exact => TagValueSetStorage::Set(HashSet::new()), + Mode::ExactFingerprint => TagValueSetStorage::Fingerprint(FingerprintStorage::new()), Mode::Probabilistic(config) => { TagValueSetStorage::Bloom(BloomFilterStorage::new(config.cache_size_per_key)) } @@ -73,6 +122,7 @@ impl AcceptedTagValueSet { match &self.storage { TagValueSetStorage::Set(set) => set.contains(value), TagValueSetStorage::Bloom(bloom) => bloom.contains(value), + TagValueSetStorage::Fingerprint(fp) => fp.contains(value), } } @@ -80,6 +130,7 @@ impl AcceptedTagValueSet { match &self.storage { TagValueSetStorage::Set(set) => set.len(), TagValueSetStorage::Bloom(bloom) => bloom.count(), + TagValueSetStorage::Fingerprint(fp) => fp.len(), } } @@ -89,6 +140,7 @@ impl AcceptedTagValueSet { set.insert(value); } TagValueSetStorage::Bloom(bloom) => bloom.insert(&value), + TagValueSetStorage::Fingerprint(fp) => fp.insert(&value), }; } } @@ -96,7 +148,10 @@ impl AcceptedTagValueSet { #[cfg(test)] mod tests { use super::*; - use crate::{event::metric::TagValueSet, transforms::tag_cardinality_limit::config::Mode}; + use crate::{ + event::metric::TagValueSet, + transforms::tag_cardinality_limit::config::{BloomFilterConfig, Mode}, + }; #[test] fn test_accepted_tag_value_set_exact() { @@ -116,7 +171,11 @@ mod tests { #[test] fn test_accepted_tag_value_set_probabilistic() { - let mut accepted_tag_value_set = AcceptedTagValueSet::new(&Mode::Exact); + // Previously this test mistakenly constructed Mode::Exact; fixed to use Probabilistic. 
+ let mut accepted_tag_value_set = + AcceptedTagValueSet::new(&Mode::Probabilistic(BloomFilterConfig { + cache_size_per_key: 5 * 1024, + })); assert!(!accepted_tag_value_set.contains(&TagValueSet::from(["value1".to_string()]))); assert_eq!(accepted_tag_value_set.len(), 0); @@ -134,4 +193,51 @@ mod tests { assert_eq!(accepted_tag_value_set.len(), 2); assert!(accepted_tag_value_set.contains(&TagValueSet::from(["value2".to_string()]))); } + + #[test] + fn test_accepted_tag_value_set_fingerprint() { + let mut set = AcceptedTagValueSet::new(&Mode::ExactFingerprint); + + assert!(!set.contains(&TagValueSet::from(["value1".to_string()]))); + assert_eq!(set.len(), 0); + + set.insert(TagValueSet::from(["value1".to_string()])); + assert_eq!(set.len(), 1); + assert!(set.contains(&TagValueSet::from(["value1".to_string()]))); + + // Inserting the same value again must not increase the count. + set.insert(TagValueSet::from(["value1".to_string()])); + assert_eq!(set.len(), 1); + + set.insert(TagValueSet::from(["value2".to_string()])); + assert_eq!(set.len(), 2); + assert!(set.contains(&TagValueSet::from(["value2".to_string()]))); + + // An un-inserted value must not appear to be contained. + assert!(!set.contains(&TagValueSet::from(["value3".to_string()]))); + + // Fingerprinting is deterministic, so a separate set must agree on membership. + let mut set2 = AcceptedTagValueSet::new(&Mode::ExactFingerprint); + set2.insert(TagValueSet::from(["value1".to_string()])); + assert!(set2.contains(&TagValueSet::from(["value1".to_string()]))); + assert!(!set2.contains(&TagValueSet::from(["value3".to_string()]))); + } + + #[test] + fn test_fingerprint_distribution_no_collisions() { + // Empirically guards the "good distribution" claim: inserting many distinct values + // must yield an equal number of distinct fingerprints. At 64 bits the birthday + // collision probability for 100k values is ~2.7e-10, so any collision here would + // indicate a badly-distributed hash rather than bad luck. 
+ let mut set = AcceptedTagValueSet::new(&Mode::ExactFingerprint); + let n = 100_000; + for i in 0..n { + set.insert(TagValueSet::from([format!("tag-value-{i}")])); + } + assert_eq!( + set.len(), + n, + "distinct values must produce distinct fingerprints" + ); + } } diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs index b8a453a3c4844..e2004f60b3d50 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -121,6 +121,24 @@ fn make_transform_bloom_with_per_metric_limits( } } +fn make_transform_fingerprint( + value_limit: usize, + limit_exceeded_action: LimitExceededAction, +) -> Config { + Config { + global: Inner { + value_limit, + limit_exceeded_action, + mode: Mode::ExactFingerprint, + internal_metrics: InternalMetricsConfig::default(), + }, + tracking_scope: TrackingScope::default(), + max_tracked_keys: None, + per_metric_limits: HashMap::new(), + per_tag_limits: HashMap::new(), + } +} + fn make_transform_with_global_per_tag_limits( value_limit: usize, limit_exceeded_action: LimitExceededAction, @@ -1235,6 +1253,11 @@ fn global_per_tag_excluded_drop_tag_passthrough_bloom() { })); } +#[test] +fn global_per_tag_excluded_drop_tag_passthrough_fingerprint() { + global_per_tag_excluded_drop_tag_passthrough(Mode::ExactFingerprint); +} + /// A globally-excluded tag passes through unchanged on every metric, even when its values /// would have exceeded `value_limit`. Sibling non-excluded tags still respect the limit. fn global_per_tag_excluded_drop_tag_passthrough(mode: Mode) { @@ -1287,6 +1310,11 @@ fn global_per_tag_excluded_drop_event_passthrough_bloom() { })); } +#[test] +fn global_per_tag_excluded_drop_event_passthrough_fingerprint() { + global_per_tag_excluded_drop_event_passthrough(Mode::ExactFingerprint); +} + /// Under `DropEvent`, a globally-excluded tag never triggers a drop, but a non-excluded /// tag exceeding `value_limit` still does. 
fn global_per_tag_excluded_drop_event_passthrough(mode: Mode) { @@ -1461,3 +1489,117 @@ per_tag_limits: let excluded = parsed.per_tag_limits.get("excluded_tag").unwrap(); assert_eq!(excluded.mode, PerTagMode::Excluded); } + +/// Under `DropTag`, fingerprint mode accepts exactly `value_limit` distinct values per +/// tag key and drops subsequent new values. +#[test] +fn fingerprint_drop_tag_respects_value_limit() { + let mut transform = + TagCardinalityLimit::new(make_transform_fingerprint(2, LimitExceededAction::DropTag)); + + // First two distinct values for "env" are accepted. + let e1 = transform + .transform_one(make_metric(metric_tags!("env" => "prod"))) + .unwrap(); + assert_eq!("prod", e1.as_metric().tags().unwrap().get("env").unwrap()); + + let e2 = transform + .transform_one(make_metric(metric_tags!("env" => "staging"))) + .unwrap(); + assert_eq!( + "staging", + e2.as_metric().tags().unwrap().get("env").unwrap() + ); + + // Third distinct value — limit reached, tag must be dropped. + let e3 = transform + .transform_one(make_metric(metric_tags!("env" => "dev"))) + .unwrap(); + assert!( + !e3.as_metric().tags().unwrap().contains_key("env"), + "fingerprint mode should drop the tag after value_limit is reached" + ); + + // A previously-accepted value still passes through after the limit is hit. + let e4 = transform + .transform_one(make_metric(metric_tags!("env" => "prod"))) + .unwrap(); + assert_eq!("prod", e4.as_metric().tags().unwrap().get("env").unwrap()); +} + +/// Under `DropEvent`, fingerprint mode drops the entire event when any tag would exceed +/// its `value_limit`. 
+#[test] +fn fingerprint_drop_event_respects_value_limit() { + let mut transform = TagCardinalityLimit::new(make_transform_fingerprint( + 2, + LimitExceededAction::DropEvent, + )); + + let e1 = make_metric(metric_tags!("env" => "prod")); + let e2 = make_metric(metric_tags!("env" => "staging")); + let e3 = make_metric(metric_tags!("env" => "dev")); + // Re-send of an already-accepted value must NOT drop the event. + let e4 = make_metric(metric_tags!("env" => "prod")); + + assert_eq!(transform.transform_one(e1.clone()), Some(e1)); + assert_eq!(transform.transform_one(e2.clone()), Some(e2)); + assert_eq!( + transform.transform_one(e3), + None, + "3rd distinct value should drop the event" + ); + assert_eq!( + transform.transform_one(e4.clone()), + Some(e4), + "re-send of accepted value must not drop the event" + ); +} + +/// Fingerprint mode must never allocate a tracking entry for a tag that is globally +/// excluded, matching the `Mode::Exact` "never allocate" contract. +#[test] +fn fingerprint_excluded_tag_never_populates_cache() { + let config = make_transform_with_global_per_tag_limits( + 2, + LimitExceededAction::DropTag, + Mode::ExactFingerprint, + HashMap::from([("kube_pod_name".to_string(), make_per_tag_excluded())]), + ); + let mut transform = TagCardinalityLimit::new(config); + + for i in 0..10 { + let event = make_metric(metric_tags!( + "kube_pod_name" => format!("pod-{i}").as_str(), + "tag1" => "val1" + )); + transform.transform_one(event).unwrap(); + } + + let bucket = transform + .accepted_tags + .get(&None) + .expect("non-excluded tag1 should still allocate a global bucket"); + assert!( + bucket.contains_key("tag1"), + "non-excluded tag must still be tracked" + ); + assert!( + !bucket.contains_key("kube_pod_name"), + "excluded tag key must never enter the fingerprint cache" + ); +} + +/// Fingerprint mode YAML round-trips: `mode: exact_fingerprint` deserializes cleanly. 
+#[test] +fn fingerprint_mode_deserializes() { + let yaml = "mode: exact_fingerprint"; + let mode: Mode = serde_yaml::from_str(yaml).expect("exact_fingerprint should deserialize"); + assert_eq!(mode, Mode::ExactFingerprint); + + let serialized = serde_yaml::to_string(&mode).expect("should serialize"); + assert!( + serialized.contains("exact_fingerprint"), + "serialized form should contain 'exact_fingerprint'" + ); +} diff --git a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue index fe47c37655f90..05a28ad70e627 100644 --- a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue +++ b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue @@ -66,6 +66,13 @@ generated: components: transforms: tag_cardinality_limit: configuration: { This mode has higher memory requirements than `probabilistic`, but never falsely outputs metrics with new tags after the limit has been hit. """ + exact_fingerprint: """ + This mode operates similarly to `exact` mode except it tracks cardinality using 64-bit hash fingerprints + of tag values instead of the original strings. This leads to lower memory requirements in most + scenarios (assuming average tag value size is greater than 8 bytes) at the cost of slightly + reduced throughput due to extra hashing operations and a very small chance of collisions at + very high cardinalities + """ probabilistic: """ Tracks cardinality probabilistically. @@ -126,7 +133,8 @@ generated: components: transforms: tag_cardinality_limit: configuration: { description: "Controls the approach taken for tracking tag cardinality." required: true type: string: enum: { - exact: "Tracks cardinality exactly. See `Mode::Exact` for details." + exact: "Tracks cardinality exactly. See `Mode::Exact` for details." + exact_fingerprint: "Tracks cardinality using 64-bit hash fingerprints. 
See `Mode::ExactFingerprint` for details." excluded: """ Skip cardinality tracking for this metric. All tag values pass through and nothing is limited. Other fields in this per-metric configuration are ignored when this is selected. diff --git a/website/cue/reference/components/transforms/tag_cardinality_limit.cue b/website/cue/reference/components/transforms/tag_cardinality_limit.cue index 5aad8656d8eff..ac2d475e1f9c2 100644 --- a/website/cue/reference/components/transforms/tag_cardinality_limit.cue +++ b/website/cue/reference/components/transforms/tag_cardinality_limit.cue @@ -125,6 +125,10 @@ components: transforms: tag_cardinality_limit: { metrics) ``` + Mode `exact_fingerprint` behaves like `exact` but stores an 8-byte hash of each + value instead of the value itself, so use the same formula with `8` in place of + the average tag value length. + In mode `probabilistic`, rather than storing all values seen for each key, each distinct key has a bloom filter which can probabilistically determine whether a given value has been seen for that key. 
The formula for estimating memory From a3756c163cc9231bd1dd975013a41b5b555e47b4 Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Wed, 17 Jun 2026 15:41:31 +0000 Subject: [PATCH 2/4] Add suggested changes (test refactor + fps rename + use derive) --- .../tag_cardinality_limit/tag_value_set.rs | 16 ++-- src/transforms/tag_cardinality_limit/tests.rs | 87 +++++-------------- 2 files changed, 30 insertions(+), 73 deletions(-) diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index 1ce91e7816692..7d8a556e78a1c 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -66,16 +66,12 @@ impl BloomFilterStorage { } } +#[derive(Default)] struct FingerprintStorage { - fps: HashedSet, + fingerprints: HashedSet, } impl FingerprintStorage { - fn new() -> Self { - Self { - fps: HashedSet::default(), - } - } /// Compute a 64-bit fingerprint of a tag value fn fingerprint(value: &TagValueSet) -> u64 { @@ -83,15 +79,15 @@ impl FingerprintStorage { } fn insert(&mut self, value: &TagValueSet) { - self.fps.insert(Self::fingerprint(value)); + self.fingerprints.insert(Self::fingerprint(value)); } fn contains(&self, value: &TagValueSet) -> bool { - self.fps.contains(&Self::fingerprint(value)) + self.fingerprints.contains(&Self::fingerprint(value)) } fn len(&self) -> usize { - self.fps.len() + self.fingerprints.len() } } @@ -110,7 +106,7 @@ impl AcceptedTagValueSet { pub fn new(mode: &Mode) -> Self { let storage = match &mode { Mode::Exact => TagValueSetStorage::Set(HashSet::new()), - Mode::ExactFingerprint => TagValueSetStorage::Fingerprint(FingerprintStorage::new()), + Mode::ExactFingerprint => TagValueSetStorage::Fingerprint(FingerprintStorage::default()), Mode::Probabilistic(config) => { TagValueSetStorage::Bloom(BloomFilterStorage::new(config.cache_size_per_key)) } diff --git a/src/transforms/tag_cardinality_limit/tests.rs 
b/src/transforms/tag_cardinality_limit/tests.rs index e2004f60b3d50..7ba6accbb7fba 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -169,6 +169,11 @@ async fn tag_cardinality_limit_drop_event_bloom() { drop_event(make_transform_bloom(2, LimitExceededAction::DropEvent)).await; } +#[tokio::test] +async fn tag_cardinality_limit_drop_event_fingerprint() { + drop_event(make_transform_fingerprint(2, LimitExceededAction::DropEvent)).await; +} + async fn drop_event(config: Config) { assert_transform_compliance(async move { let mut event1 = make_metric(metric_tags!("tag1" => "val1")); @@ -221,6 +226,11 @@ async fn tag_cardinality_limit_drop_tag_bloom() { drop_tag(make_transform_bloom(2, LimitExceededAction::DropTag)).await; } +#[tokio::test] +async fn tag_cardinality_limit_drop_tag_fingerprint() { + drop_tag(make_transform_fingerprint(2, LimitExceededAction::DropTag)).await; +} + async fn drop_tag(config: Config) { assert_transform_compliance(async move { let tags1 = metric_tags!("tag1" => "val1", "tag2" => "val1"); @@ -1490,70 +1500,21 @@ per_tag_limits: assert_eq!(excluded.mode, PerTagMode::Excluded); } -/// Under `DropTag`, fingerprint mode accepts exactly `value_limit` distinct values per -/// tag key and drops subsequent new values. +/// A re-sent already-accepted tag value must pass through even after the limit is hit, +/// for both DropTag and DropEvent actions. #[test] -fn fingerprint_drop_tag_respects_value_limit() { - let mut transform = - TagCardinalityLimit::new(make_transform_fingerprint(2, LimitExceededAction::DropTag)); - - // First two distinct values for "env" are accepted. 
- let e1 = transform - .transform_one(make_metric(metric_tags!("env" => "prod"))) - .unwrap(); - assert_eq!("prod", e1.as_metric().tags().unwrap().get("env").unwrap()); - - let e2 = transform - .transform_one(make_metric(metric_tags!("env" => "staging"))) - .unwrap(); - assert_eq!( - "staging", - e2.as_metric().tags().unwrap().get("env").unwrap() - ); - - // Third distinct value — limit reached, tag must be dropped. - let e3 = transform - .transform_one(make_metric(metric_tags!("env" => "dev"))) - .unwrap(); - assert!( - !e3.as_metric().tags().unwrap().contains_key("env"), - "fingerprint mode should drop the tag after value_limit is reached" - ); - - // A previously-accepted value still passes through after the limit is hit. - let e4 = transform - .transform_one(make_metric(metric_tags!("env" => "prod"))) - .unwrap(); - assert_eq!("prod", e4.as_metric().tags().unwrap().get("env").unwrap()); -} - -/// Under `DropEvent`, fingerprint mode drops the entire event when any tag would exceed -/// its `value_limit`. -#[test] -fn fingerprint_drop_event_respects_value_limit() { - let mut transform = TagCardinalityLimit::new(make_transform_fingerprint( - 2, - LimitExceededAction::DropEvent, - )); - - let e1 = make_metric(metric_tags!("env" => "prod")); - let e2 = make_metric(metric_tags!("env" => "staging")); - let e3 = make_metric(metric_tags!("env" => "dev")); - // Re-send of an already-accepted value must NOT drop the event. 
- let e4 = make_metric(metric_tags!("env" => "prod")); - - assert_eq!(transform.transform_one(e1.clone()), Some(e1)); - assert_eq!(transform.transform_one(e2.clone()), Some(e2)); - assert_eq!( - transform.transform_one(e3), - None, - "3rd distinct value should drop the event" - ); - assert_eq!( - transform.transform_one(e4.clone()), - Some(e4), - "re-send of accepted value must not drop the event" - ); +fn fingerprint_accepted_value_passes_through_after_limit() { + for action in [LimitExceededAction::DropTag, LimitExceededAction::DropEvent] { + let mut transform = + TagCardinalityLimit::new(make_transform_fingerprint(2, action)); + transform.transform_one(make_metric(metric_tags!("env" => "prod"))).unwrap(); + transform.transform_one(make_metric(metric_tags!("env" => "staging"))).unwrap(); + // Limit now hit; re-send of an already-accepted value must still pass through. + let e = transform + .transform_one(make_metric(metric_tags!("env" => "prod"))) + .unwrap(); + assert_eq!("prod", e.as_metric().tags().unwrap().get("env").unwrap()); + } } /// Fingerprint mode must never allocate a tracking entry for a tag that is globally From 430f0371b42431612587cd2fbaa4ec731ef0eb07 Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Wed, 17 Jun 2026 15:44:38 +0000 Subject: [PATCH 3/4] cargo fmt --- .../tag_cardinality_limit/tag_value_set.rs | 5 +++-- src/transforms/tag_cardinality_limit/tests.rs | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index 7d8a556e78a1c..09ec332f803bb 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -72,7 +72,6 @@ struct FingerprintStorage { } impl FingerprintStorage { - /// Compute a 64-bit fingerprint of a tag value fn fingerprint(value: &TagValueSet) -> u64 { BuildHasherDefault::::default().hash_one(value) @@ -106,7 +105,9 @@ 
impl AcceptedTagValueSet { pub fn new(mode: &Mode) -> Self { let storage = match &mode { Mode::Exact => TagValueSetStorage::Set(HashSet::new()), - Mode::ExactFingerprint => TagValueSetStorage::Fingerprint(FingerprintStorage::default()), + Mode::ExactFingerprint => { + TagValueSetStorage::Fingerprint(FingerprintStorage::default()) + } Mode::Probabilistic(config) => { TagValueSetStorage::Bloom(BloomFilterStorage::new(config.cache_size_per_key)) } diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs index 7ba6accbb7fba..423f65f8412bd 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -171,7 +171,11 @@ async fn tag_cardinality_limit_drop_event_bloom() { #[tokio::test] async fn tag_cardinality_limit_drop_event_fingerprint() { - drop_event(make_transform_fingerprint(2, LimitExceededAction::DropEvent)).await; + drop_event(make_transform_fingerprint( + 2, + LimitExceededAction::DropEvent, + )) + .await; } async fn drop_event(config: Config) { @@ -1505,10 +1509,13 @@ per_tag_limits: #[test] fn fingerprint_accepted_value_passes_through_after_limit() { for action in [LimitExceededAction::DropTag, LimitExceededAction::DropEvent] { - let mut transform = - TagCardinalityLimit::new(make_transform_fingerprint(2, action)); - transform.transform_one(make_metric(metric_tags!("env" => "prod"))).unwrap(); - transform.transform_one(make_metric(metric_tags!("env" => "staging"))).unwrap(); + let mut transform = TagCardinalityLimit::new(make_transform_fingerprint(2, action)); + transform + .transform_one(make_metric(metric_tags!("env" => "prod"))) + .unwrap(); + transform + .transform_one(make_metric(metric_tags!("env" => "staging"))) + .unwrap(); // Limit now hit; re-send of an already-accepted value must still pass through. 
let e = transform .transform_one(make_metric(metric_tags!("env" => "prod"))) From 9a7deab9c7c00bc123ab0818742a37ef99cb892d Mon Sep 17 00:00:00 2001 From: ArunPiduguDD Date: Tue, 23 Jun 2026 15:32:08 +0000 Subject: [PATCH 4/4] Initialize exact fingerprint hasher with random seed --- .../tag_cardinality_limit/config.rs | 2 +- .../tag_cardinality_limit/tag_value_set.rs | 39 ++++++++++++++----- .../generated/tag_cardinality_limit.cue | 2 +- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/transforms/tag_cardinality_limit/config.rs b/src/transforms/tag_cardinality_limit/config.rs index 43e255f82281c..a7a83ba399ee6 100644 --- a/src/transforms/tag_cardinality_limit/config.rs +++ b/src/transforms/tag_cardinality_limit/config.rs @@ -118,7 +118,7 @@ pub enum Mode { /// of tag values instead of the original strings. This leads to lower memory requirements in most /// scenarios (assuming average tag value size is greater than 8 bytes) at the cost of slightly /// reduced throughput due to extra hashing operations and a very small chance of collisions at - /// very high cardinalities + /// very high cardinalities. ExactFingerprint, /// Tracks cardinality probabilistically. diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index 09ec332f803bb..a223bb3ffbedc 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -1,12 +1,11 @@ use std::{ - collections::HashSet, + collections::{HashSet, hash_map::RandomState}, fmt, - hash::{BuildHasher, BuildHasherDefault}, + hash::BuildHasher, }; use bloomy::BloomFilter; use hash_hasher::HashedSet; -use seahash::SeaHasher; use crate::{event::metric::TagValueSet, transforms::tag_cardinality_limit::config::Mode}; @@ -69,20 +68,22 @@ impl BloomFilterStorage { #[derive(Default)] struct FingerprintStorage { fingerprints: HashedSet, + /// Per-instance randomized hasher state. 
Each instance gets a distinct seed, making + /// pre-computed collision attacks infeasible. + seed: RandomState, } impl FingerprintStorage { - /// Compute a 64-bit fingerprint of a tag value - fn fingerprint(value: &TagValueSet) -> u64 { - BuildHasherDefault::::default().hash_one(value) + fn fingerprint(&self, value: &TagValueSet) -> u64 { + self.seed.hash_one(value) } fn insert(&mut self, value: &TagValueSet) { - self.fingerprints.insert(Self::fingerprint(value)); + self.fingerprints.insert(self.fingerprint(value)); } fn contains(&self, value: &TagValueSet) -> bool { - self.fingerprints.contains(&Self::fingerprint(value)) + self.fingerprints.contains(&self.fingerprint(value)) } fn len(&self) -> usize { @@ -213,13 +214,33 @@ mod tests { // An un-inserted value must not appear to be contained. assert!(!set.contains(&TagValueSet::from(["value3".to_string()]))); - // Fingerprinting is deterministic, so a separate set must agree on membership. + // Within-instance consistency: a value inserted into a set is found in that same set. let mut set2 = AcceptedTagValueSet::new(&Mode::ExactFingerprint); set2.insert(TagValueSet::from(["value1".to_string()])); assert!(set2.contains(&TagValueSet::from(["value1".to_string()]))); assert!(!set2.contains(&TagValueSet::from(["value3".to_string()]))); } + #[test] + fn test_fingerprint_storage_uses_independent_seeds() { + // Two fresh FingerprintStorage instances must normally produce different fingerprints + // for the same value, proving that the per-instance random seed is active and no + // shared fixed seed exists that an attacker could exploit. + // + // Collision probability across two independent instances is ~2^-64; a failure here + // would indicate the seed is not being randomised. 
+ let probe = TagValueSet::from(["probe-value".to_string()]); + let s1 = AcceptedTagValueSet::new(&Mode::ExactFingerprint); + let s2 = AcceptedTagValueSet::new(&Mode::ExactFingerprint); + // Insert into s1, must NOT appear in s2 (different seed → different fingerprint) + let mut s1 = s1; + s1.insert(probe.clone()); + assert!( + !s2.contains(&probe), + "distinct FingerprintStorage instances must use independent random seeds" + ); + } + #[test] fn test_fingerprint_distribution_no_collisions() { // Empirically guards the "good distribution" claim: inserting many distinct values diff --git a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue index 05a28ad70e627..7d4ebe3368e50 100644 --- a/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue +++ b/website/cue/reference/components/transforms/generated/tag_cardinality_limit.cue @@ -71,7 +71,7 @@ generated: components: transforms: tag_cardinality_limit: configuration: { of tag values instead of the original strings. This leads to lower memory requirements in most scenarios (assuming average tag value size is greater than 8 bytes) at the cost of slightly reduced throughput due to extra hashing operations and a very small chance of collisions at - very high cardinalities + very high cardinalities. """ probabilistic: """ Tracks cardinality probabilistically.