From 00d7ddc9d903612c3f4a0b4cbe34a709b11a939b Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Sun, 21 Dec 2025 14:31:35 +0900 Subject: [PATCH] Protect against HashMap rehash spikes in UnresolvedSamples While adding support for in-kernel unwinding with sample rates as high as 4KHz, sample dropping became an issue. Looking closer using a `perf record` of compiler running on 16 CPU, 4KHz samples, rehashing was taking over 260ms in the worst case (see "Stack Chart" and highlighting rehash): https://share.firefox.dev/3Y32wsT This changes so that UnresolvedStacks uses a BTreeMap for the interning storage, which has a much more bounded O(log n) worst case insertion. An LRU hash map serves on top as an O(1) fast path. The new implementation is roughly on par and perhaps slightly faster, and processing time spikes are almost gone: https://share.firefox.dev/4pMM9wy Footnote, the `stacks` array itself also suffers from O(n) realloc behavior, but that doesn't seem to be problematic *yet*, so I'll leave this for when it's found to be a problem in the future. --- Cargo.lock | 37 ++++++++++++++++ samply/Cargo.toml | 1 + samply/src/shared/types.rs | 4 +- samply/src/shared/unresolved_samples.rs | 56 ++++++++++++++++++------- 4 files changed, 82 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dbd5df0c..93cab0489 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -18,6 +18,19 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -867,6 +880,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" + [[package]] name = "hashbrown" version = "0.15.5" @@ -2240,6 +2259,7 @@ dependencies = [ "samply-debugid", "samply-object", "samply-quota-manager", + "schnellru", "serde", "serde_derive", "serde_json", @@ -2353,6 +2373,17 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a4416eddc0eaf31e04aa4039bd3db4288ea1ba613955d86cf9c310049c5d1e2" +[[package]] +name = "schnellru" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649" +dependencies = [ + "ahash", + "cfg-if", + "hashbrown 0.13.2", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2868,6 +2899,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/samply/Cargo.toml b/samply/Cargo.toml index bd3f16c5f..c2751192b 100644 --- a/samply/Cargo.toml +++ b/samply/Cargo.toml @@ -43,6 +43,7 @@ serde = "1.0.204" wholesym = { version = "0.8.1", path = "../wholesym", features = ["api"]} platform-dirs = "0.3" rustc-hash = "2" +schnellru = "0.2" mio = { version = "1", features = ["os-ext", "os-poll"] } ctrlc = "3.4.4" log = "0.4.21" diff --git a/samply/src/shared/types.rs b/samply/src/shared/types.rs index 210aa2491..b102224e6 100644 --- a/samply/src/shared/types.rs +++ b/samply/src/shared/types.rs @@ -7,7 +7,7 @@ use linux_perf_event_reader::CpuMode; pub type FastHashMap = rustc_hash::FxHashMap; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum StackMode { User, Kernel, @@ -37,7 +37,7 @@ impl From for StackMode { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum StackFrame { InstructionPointer(u64, StackMode), ReturnAddress(u64, StackMode), diff --git a/samply/src/shared/unresolved_samples.rs b/samply/src/shared/unresolved_samples.rs index 0b0c6bd54..c36f810d7 100644 --- a/samply/src/shared/unresolved_samples.rs +++ b/samply/src/shared/unresolved_samples.rs @@ -1,6 +1,9 @@ use std::collections::hash_map::Entry; +use std::collections::BTreeMap; use fxprof_processed_profile::{CpuDelta, FrameHandle, MarkerHandle, ThreadHandle, Timestamp}; +use rustc_hash::FxBuildHasher; +use schnellru::{ByLength, LruMap}; use super::types::{FastHashMap, StackFrame, StackMode}; @@ -154,7 +157,7 @@ pub struct SampleData { pub weight: i32, } -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct UnresolvedStackHandle(u32); impl UnresolvedStackHandle { @@ -162,10 +165,36 @@ impl UnresolvedStackHandle { pub const EMPTY: Self = Self(u32::MAX); } -#[derive(Debug, Clone, Default)] +/// An interner that recursively interns stacks in the form of (prefix, frame). +/// +/// The interner avoids the worst case O(n) runtime behavior of hash maps due to rehashes. Under the +/// hood, BTreeMap is used to store the full mapping with worst case O(log n) inserts, while an LRU +/// hash map maintains a O(1) fast path for the majority of accesses. pub struct UnresolvedStacks { pub stacks: Vec<(UnresolvedStackHandle, StackFrame)>, // (prefix, frame) - pub stack_lookup: FastHashMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index + stack_lookup: BTreeMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index + stack_cache: + LruMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle, ByLength, FxBuildHasher>, +} + +impl UnresolvedStacks { + // Two things to consider for the size of table: + // - The cache should be small enough to fit in CPU cache + // - LruMap uses SwissTable under the hood where erases use tombstones and rehashing is required + // for compaction. The cache should be small enough to bound rehashing cost + // + // Assuming 64B per entry, a 256KB L2 can fit 4K entries, and rehashing time seems to be under 1ms. + const CACHE_SIZE: u32 = 4096; +} + +impl Default for UnresolvedStacks { + fn default() -> Self { + Self { + stacks: Vec::new(), + stack_cache: LruMap::with_hasher(ByLength::new(Self::CACHE_SIZE), FxBuildHasher), + stack_lookup: BTreeMap::new(), + } + } } impl UnresolvedStacks { @@ -182,11 +211,17 @@ impl UnresolvedStacks { ) -> UnresolvedStackHandle { for frame in frames { let x = (prefix, frame); + if let Some(node) = self.stack_cache.get(&x).copied() { + prefix = node; + continue; + } + let node = *self.stack_lookup.entry(x).or_insert_with(|| { let new_index = self.stacks.len() as u32; self.stacks.push(x); UnresolvedStackHandle(new_index) }); + self.stack_cache.insert(x, node); prefix = node; } prefix @@ -198,17 +233,10 @@ impl UnresolvedStacks { &mut self, frames: impl Iterator, ) -> UnresolvedStackHandle { - let mut prefix = UnresolvedStackHandle::EMPTY; - for frame in frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)) { - let x = (prefix, frame); - let node = *self.stack_lookup.entry(x).or_insert_with(|| { - let new_index = self.stacks.len() as u32; - self.stacks.push(x); - UnresolvedStackHandle(new_index) - }); - prefix = node; - } - prefix + self.convert_with_prefix( + UnresolvedStackHandle::EMPTY, + frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)), + ) } // Appends the stack to `buf`, starting with the callee-most frame.