Skip to content

Commit c5cbcc2

Browse files
committed
Protect against HashMap rehash spikes in UnresolvedSamples
While adding support for in-kernel unwinding with sample rates as high as 4KHz, sample dropping became an issue. Looking closer using a `perf record` of compiler running on 16 CPU, 4KHz samples, rehashing was taking over 260ms in the worst case (see "Stack Chart" and highlighting rehash): https://share.firefox.dev/3Y32wsT This changes so that UnresolvedStacks uses a BTreeMap for the interning storage, which has a much more bounded O(log n) worst case insertion. An LRU hash map serves on top as an O(1) fast path. The new implementation is roughly on par and perhaps slightly faster, and processing time spikes are almost gone: https://share.firefox.dev/4pMM9wy Footnote, the `stacks` array itself also suffers from O(n) realloc behavior, but that doesn't seem to be problematic *yet*, so I'll leave this for when it's found to be a problem in the future.
1 parent c947d63 commit c5cbcc2

File tree

4 files changed

+81
-16
lines changed

4 files changed

+81
-16
lines changed

Cargo.lock

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

samply/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ serde = "1.0.204"
4343
wholesym = { version = "0.8.1", path = "../wholesym", features = ["api"]}
4444
platform-dirs = "0.3"
4545
rustc-hash = "2"
46+
schnellru = "0.2"
4647
mio = { version = "1", features = ["os-ext", "os-poll"] }
4748
ctrlc = "3.4.4"
4849
log = "0.4.21"

samply/src/shared/types.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use linux_perf_event_reader::CpuMode;
77

88
pub type FastHashMap<K, V> = rustc_hash::FxHashMap<K, V>;
99

10-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
10+
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
1111
pub enum StackMode {
1212
User,
1313
Kernel,
@@ -37,7 +37,7 @@ impl From<CpuMode> for StackMode {
3737
}
3838
}
3939

40-
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
40+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
4141
pub enum StackFrame {
4242
InstructionPointer(u64, StackMode),
4343
ReturnAddress(u64, StackMode),

samply/src/shared/unresolved_samples.rs

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
use std::collections::hash_map::Entry;
2+
use std::collections::BTreeMap;
23

34
use fxprof_processed_profile::{CpuDelta, FrameHandle, MarkerHandle, ThreadHandle, Timestamp};
5+
use rustc_hash::FxBuildHasher;
6+
use schnellru::{ByLength, LruMap};
47

58
use super::types::{FastHashMap, StackFrame, StackMode};
69

@@ -154,18 +157,43 @@ pub struct SampleData {
154157
pub weight: i32,
155158
}
156159

157-
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
160+
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
158161
pub struct UnresolvedStackHandle(u32);
159162

160163
impl UnresolvedStackHandle {
161164
/// Represents the empty stack / the root stack node
162165
pub const EMPTY: Self = Self(u32::MAX);
163166
}
164167

165-
#[derive(Debug, Clone, Default)]
168+
/// An interner that recursively interns stacks in the form of (prefix, frame).
169+
///
170+
/// The interner avoids the worst case O(n) runtime behavior of hash maps due to rehashes. Under the
171+
/// hood, BTreeMap is used to store the full mapping with worst case O(log n) inserts, while an LRU
172+
/// hash map maintains a O(1) fast path for the majority of accesses.
166173
pub struct UnresolvedStacks {
167174
pub stacks: Vec<(UnresolvedStackHandle, StackFrame)>, // (prefix, frame)
168-
pub stack_lookup: FastHashMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index
175+
stack_lookup: BTreeMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index
176+
stack_cache: LruMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle, ByLength, FxBuildHasher>,
177+
}
178+
179+
impl UnresolvedStacks {
180+
// Two things to consider for the size of table:
181+
// - The cache should be small enough to fit in CPU cache
182+
// - LruMap uses SwissTable under the hood where erases use tombstones and rehashing is required
183+
// for compaction. The cache should be small enough to bound rehashing cost
184+
//
185+
// Assuming 64B per entry, a 256KB L2 can fit 4K entries, and rehashing time seems to be under 1ms.
186+
const CACHE_SIZE: u32 = 4096;
187+
}
188+
189+
impl Default for UnresolvedStacks {
190+
fn default() -> Self {
191+
Self {
192+
stacks: Vec::new(),
193+
stack_cache: LruMap::with_hasher(ByLength::new(Self::CACHE_SIZE), FxBuildHasher),
194+
stack_lookup: BTreeMap::new(),
195+
}
196+
}
169197
}
170198

171199
impl UnresolvedStacks {
@@ -182,11 +210,17 @@ impl UnresolvedStacks {
182210
) -> UnresolvedStackHandle {
183211
for frame in frames {
184212
let x = (prefix, frame);
213+
if let Some(node) = self.stack_cache.get(&x).copied() {
214+
prefix = node;
215+
continue;
216+
}
217+
185218
let node = *self.stack_lookup.entry(x).or_insert_with(|| {
186219
let new_index = self.stacks.len() as u32;
187220
self.stacks.push(x);
188221
UnresolvedStackHandle(new_index)
189222
});
223+
self.stack_cache.insert(x, node);
190224
prefix = node;
191225
}
192226
prefix
@@ -198,17 +232,10 @@ impl UnresolvedStacks {
198232
&mut self,
199233
frames: impl Iterator<Item = StackFrame>,
200234
) -> UnresolvedStackHandle {
201-
let mut prefix = UnresolvedStackHandle::EMPTY;
202-
for frame in frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)) {
203-
let x = (prefix, frame);
204-
let node = *self.stack_lookup.entry(x).or_insert_with(|| {
205-
let new_index = self.stacks.len() as u32;
206-
self.stacks.push(x);
207-
UnresolvedStackHandle(new_index)
208-
});
209-
prefix = node;
210-
}
211-
prefix
235+
self.convert_with_prefix(
236+
UnresolvedStackHandle::EMPTY,
237+
frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)),
238+
)
212239
}
213240

214241
// Appends the stack to `buf`, starting with the callee-most frame.

0 commit comments

Comments
 (0)