Skip to content

Commit 80bbe09

Browse files
authored
Protect against HashMap rehash spikes in UnresolvedSamples (#737)
While adding support for in-kernel unwinding with sample rates as high as 4KHz, sample dropping became an issue. Looking closer using a `perf record` of compiler running on 16 CPU, 4KHz samples, rehashing was taking over 260ms in the worst case (see "Stack Chart" and highlighting rehash): https://share.firefox.dev/3Y32wsT This changes so that UnresolvedStacks uses a BTreeMap for the interning storage, which has a much more bounded O(log n) worst case insertion. An LRU hash map serves on top as an O(1) fast path. The new implementation is roughly on par and perhaps slightly faster, and processing time spikes are almost gone: https://share.firefox.dev/4pMM9wy Footnote, the `stacks` array itself also suffers from O(n) realloc behavior, but that doesn't seem to be problematic *yet*, so I'll leave this for when it's found to be a problem in the future.
2 parents a5c0dbf + 18edc12 commit 80bbe09

File tree

4 files changed

+82
-16
lines changed

4 files changed

+82
-16
lines changed

Cargo.lock

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

samply/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ serde = "1.0.204"
4343
wholesym = { version = "0.8.1", path = "../wholesym", features = ["api"]}
4444
platform-dirs = "0.3"
4545
rustc-hash = "2"
46+
schnellru = "0.2"
4647
mio = { version = "1", features = ["os-ext", "os-poll"] }
4748
ctrlc = "3.4.4"
4849
log = "0.4.21"

samply/src/shared/types.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use linux_perf_event_reader::CpuMode;
77

88
pub type FastHashMap<K, V> = rustc_hash::FxHashMap<K, V>;
99

10-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
10+
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
1111
pub enum StackMode {
1212
User,
1313
Kernel,
@@ -37,7 +37,7 @@ impl From<CpuMode> for StackMode {
3737
}
3838
}
3939

40-
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
40+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
4141
pub enum StackFrame {
4242
InstructionPointer(u64, StackMode),
4343
ReturnAddress(u64, StackMode),

samply/src/shared/unresolved_samples.rs

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
use std::collections::hash_map::Entry;
2+
use std::collections::BTreeMap;
23

34
use fxprof_processed_profile::{CpuDelta, FrameHandle, MarkerHandle, ThreadHandle, Timestamp};
5+
use rustc_hash::FxBuildHasher;
6+
use schnellru::{ByLength, LruMap};
47

58
use super::types::{FastHashMap, StackFrame, StackMode};
69

@@ -154,18 +157,44 @@ pub struct SampleData {
154157
pub weight: i32,
155158
}
156159

157-
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
160+
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
158161
pub struct UnresolvedStackHandle(u32);
159162

160163
impl UnresolvedStackHandle {
161164
/// Represents the empty stack / the root stack node
162165
pub const EMPTY: Self = Self(u32::MAX);
163166
}
164167

165-
#[derive(Debug, Clone, Default)]
168+
/// An interner that recursively interns stacks in the form of (prefix, frame).
169+
///
170+
/// The interner avoids the worst case O(n) runtime behavior of hash maps due to rehashes. Under the
171+
/// hood, BTreeMap is used to store the full mapping with worst case O(log n) inserts, while an LRU
172+
/// hash map maintains a O(1) fast path for the majority of accesses.
166173
pub struct UnresolvedStacks {
167174
pub stacks: Vec<(UnresolvedStackHandle, StackFrame)>, // (prefix, frame)
168-
pub stack_lookup: FastHashMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index
175+
stack_lookup: BTreeMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle>, // (prefix, frame) -> stack index
176+
stack_cache:
177+
LruMap<(UnresolvedStackHandle, StackFrame), UnresolvedStackHandle, ByLength, FxBuildHasher>,
178+
}
179+
180+
impl UnresolvedStacks {
181+
// Two things to consider for the size of table:
182+
// - The cache should be small enough to fit in CPU cache
183+
// - LruMap uses SwissTable under the hood where erases use tombstones and rehashing is required
184+
// for compaction. The cache should be small enough to bound rehashing cost
185+
//
186+
// Assuming 64B per entry, a 256KB L2 can fit 4K entries, and rehashing time seems to be under 1ms.
187+
const CACHE_SIZE: u32 = 4096;
188+
}
189+
190+
impl Default for UnresolvedStacks {
191+
fn default() -> Self {
192+
Self {
193+
stacks: Vec::new(),
194+
stack_cache: LruMap::with_hasher(ByLength::new(Self::CACHE_SIZE), FxBuildHasher),
195+
stack_lookup: BTreeMap::new(),
196+
}
197+
}
169198
}
170199

171200
impl UnresolvedStacks {
@@ -182,11 +211,17 @@ impl UnresolvedStacks {
182211
) -> UnresolvedStackHandle {
183212
for frame in frames {
184213
let x = (prefix, frame);
214+
if let Some(node) = self.stack_cache.get(&x).copied() {
215+
prefix = node;
216+
continue;
217+
}
218+
185219
let node = *self.stack_lookup.entry(x).or_insert_with(|| {
186220
let new_index = self.stacks.len() as u32;
187221
self.stacks.push(x);
188222
UnresolvedStackHandle(new_index)
189223
});
224+
self.stack_cache.insert(x, node);
190225
prefix = node;
191226
}
192227
prefix
@@ -198,17 +233,10 @@ impl UnresolvedStacks {
198233
&mut self,
199234
frames: impl Iterator<Item = StackFrame>,
200235
) -> UnresolvedStackHandle {
201-
let mut prefix = UnresolvedStackHandle::EMPTY;
202-
for frame in frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)) {
203-
let x = (prefix, frame);
204-
let node = *self.stack_lookup.entry(x).or_insert_with(|| {
205-
let new_index = self.stacks.len() as u32;
206-
self.stacks.push(x);
207-
UnresolvedStackHandle(new_index)
208-
});
209-
prefix = node;
210-
}
211-
prefix
236+
self.convert_with_prefix(
237+
UnresolvedStackHandle::EMPTY,
238+
frames.filter(|f| f.stack_mode() != Some(StackMode::Kernel)),
239+
)
212240
}
213241

214242
// Appends the stack to `buf`, starting with the callee-most frame.

0 commit comments

Comments
 (0)