Skip to content

Commit 7f65690

Browse files
feat: get rid of path hashing in favor of an index-based shared map
1 parent aed5b4e commit 7f65690

7 files changed

Lines changed: 89 additions & 75 deletions

File tree

crates/runner-shared/src/metadata.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize};
33
use std::collections::HashMap;
44
use std::io::BufWriter;
55
use std::path::Path;
6+
use std::path::PathBuf;
67

78
use crate::debug_info::{DebugInfoPidMapping, ModuleDebugInfo};
89
use crate::fifo::MarkerType;
@@ -48,6 +49,11 @@ pub struct PerfMetadata {
4849
/// Per-pid symbol references, mapping PID to list of perf map index + load bias
4950
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
5051
pub symbol_pid_mappings_by_pid: HashMap<i32, Vec<SymbolPidMapping>>,
52+
53+
/// Mapping from semantic key to original binary path
54+
/// Kept for traceability, and if we ever need to reconstruct the original paths from the keys
55+
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
56+
pub key_to_path: HashMap<String, PathBuf>,
5157
}
5258

5359
impl PerfMetadata {

src/executor/wall_time/perf/jit_dump.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ impl JitDump {
5353
pub fn into_unwind_data(
5454
self,
5555
) -> Result<(
56-
HashMap<String, UnwindData>,
56+
HashMap<PathBuf, UnwindData>,
5757
Vec<UnwindDataPidMappingWithFullPath>,
5858
)> {
5959
let file = std::fs::File::open(self.path)?;
@@ -80,11 +80,12 @@ impl JitDump {
8080
continue;
8181
};
8282

83-
let path = format!("jit_{name}");
83+
let path_string = format!("jit_{name}");
84+
let path = PathBuf::from(&path_string);
8485
unwind_data_by_path.insert(
8586
path.clone(),
8687
UnwindData {
87-
path: path.clone(),
88+
path: path_string,
8889
base_svma: 0,
8990
eh_frame_hdr,
9091
eh_frame_hdr_svma: 0..0,
@@ -129,7 +130,7 @@ pub async fn harvest_perf_jit_for_pids(
129130
) -> Result<(
130131
HashMap<PathBuf, ModuleSymbols>,
131132
HashMap<i32, Vec<SymbolPidMappingWithFullPath>>,
132-
HashMap<String, UnwindData>,
133+
HashMap<PathBuf, UnwindData>,
133134
HashMap<i32, Vec<UnwindDataPidMappingWithFullPath>>,
134135
)> {
135136
let mut all_symbols_by_path = HashMap::new();

src/executor/wall_time/perf/mod.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -331,16 +331,26 @@ impl BenchmarkData {
331331
.extend(mappings);
332332
}
333333

334-
let symbol_pid_mappings_by_pid =
335-
save_artifacts::save_symbols(path_ref, &symbols_by_path, &pid_symbol_mappings_by_pid);
334+
let mut path_to_key = std::collections::HashMap::<PathBuf, String>::new();
336335

337-
let (debug_info, debug_info_pid_mappings_by_pid) =
338-
save_artifacts::save_debug_info(&symbols_by_path, &pid_symbol_mappings_by_pid);
336+
let symbol_pid_mappings_by_pid = save_artifacts::save_symbols(
337+
path_ref,
338+
&symbols_by_path,
339+
&pid_symbol_mappings_by_pid,
340+
&mut path_to_key,
341+
);
342+
343+
let (debug_info, debug_info_pid_mappings_by_pid) = save_artifacts::save_debug_info(
344+
&symbols_by_path,
345+
&pid_symbol_mappings_by_pid,
346+
&mut path_to_key,
347+
);
339348

340349
let unwind_data_pid_mappings_by_pid = save_artifacts::save_unwind_data(
341350
path_ref,
342351
&unwind_data_by_path,
343352
pid_unwind_data_mappings_by_pid,
353+
&mut path_to_key,
344354
);
345355

346356
let ignored_modules = save_artifacts::collect_ignored_modules(&pid_symbol_mappings_by_pid);
@@ -361,6 +371,10 @@ impl BenchmarkData {
361371
debug_info_pid_mappings_by_pid,
362372
unwind_data_pid_mappings_by_pid,
363373
symbol_pid_mappings_by_pid,
374+
key_to_path: path_to_key
375+
.into_iter()
376+
.map(|(path, key)| (key, path))
377+
.collect(),
364378
debug_info_by_pid: Default::default(), // No longer used
365379
};
366380
metadata.save_to(&path).unwrap();
Lines changed: 17 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
1-
use std::hash::{DefaultHasher, Hash, Hasher};
1+
use std::path::Path;
22

3-
/// Convert a full path to a semantic key suitable for filenames.
3+
/// Build a semantic key from a global index and a path.
44
///
5-
/// The key is `{basename}-{hash:08x}` where `basename` is the last component
6-
/// of the path and `hash` is a truncated hash of the full path. This gives
7-
/// human-readable filenames while still being unique when multiple libraries
8-
/// share the same basename.
9-
pub fn path_to_semantic_key(path: &str) -> String {
10-
let basename = path.rsplit('/').next().unwrap_or(path);
11-
let mut hasher = DefaultHasher::new();
12-
path.hash(&mut hasher);
13-
let hash = hasher.finish() as u32;
14-
format!("{basename}-{hash:08x}")
5+
/// The key is `{index}__{basename}` where `basename` is the last component
6+
/// of the path. The index ensures uniqueness across all artifact types.
7+
pub fn indexed_semantic_key(index: usize, path: &Path) -> String {
8+
format!(
9+
"{index}__{}",
10+
path.file_name().unwrap_or_default().to_string_lossy()
11+
)
1512
}
1613

1714
#[cfg(test)]
@@ -20,39 +17,26 @@ mod tests {
2017

2118
#[test]
2219
fn test_normal_path() {
23-
let key = path_to_semantic_key("/usr/lib/libc.so.6");
24-
assert!(key.starts_with("libc.so.6-"));
25-
assert_eq!(key.len(), "libc.so.6-".len() + 8);
20+
let key = indexed_semantic_key(0, Path::new("/usr/lib/libc.so.6"));
21+
assert_eq!(key, "0__libc.so.6");
2622
}
2723

2824
#[test]
2925
fn test_jit_path() {
30-
let key = path_to_semantic_key("/tmp/jit-12345.so");
31-
assert!(key.starts_with("jit-12345.so-"));
32-
assert_eq!(key.len(), "jit-12345.so-".len() + 8);
26+
let key = indexed_semantic_key(5, Path::new("/tmp/jit-12345.so"));
27+
assert_eq!(key, "5__jit-12345.so");
3328
}
3429

3530
#[test]
3631
fn test_same_basename_different_paths() {
37-
let key1 = path_to_semantic_key("/usr/lib/libc.so.6");
38-
let key2 = path_to_semantic_key("/opt/lib/libc.so.6");
39-
// Both start with the same basename
40-
assert!(key1.starts_with("libc.so.6-"));
41-
assert!(key2.starts_with("libc.so.6-"));
42-
// But have different hashes
32+
let key1 = indexed_semantic_key(0, Path::new("/usr/lib/libc.so.6"));
33+
let key2 = indexed_semantic_key(1, Path::new("/opt/lib/libc.so.6"));
4334
assert_ne!(key1, key2);
4435
}
4536

4637
#[test]
4738
fn test_bare_filename() {
48-
let key = path_to_semantic_key("libfoo.so");
49-
assert!(key.starts_with("libfoo.so-"));
50-
}
51-
52-
#[test]
53-
fn test_deterministic() {
54-
let key1 = path_to_semantic_key("/usr/lib/libc.so.6");
55-
let key2 = path_to_semantic_key("/usr/lib/libc.so.6");
56-
assert_eq!(key1, key2);
39+
let key = indexed_semantic_key(3, Path::new("libfoo.so"));
40+
assert_eq!(key, "3__libfoo.so");
5741
}
5842
}

src/executor/wall_time/perf/parse_perf_file.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use std::path::PathBuf;
1515
/// Per-pid mounting info for a specific ELF, referencing it by path.
1616
#[derive(Debug)]
1717
pub struct UnwindDataPidMappingWithFullPath {
18-
pub path: String,
18+
pub path: PathBuf,
1919
pub timestamp: Option<u64>,
2020
pub avma_range: std::ops::Range<u64>,
2121
pub base_avma: u64,
@@ -35,7 +35,7 @@ pub struct MemmapRecordsOutput {
3535
/// Per-pid mounting info referencing symbols by path
3636
pub pid_symbol_mappings_by_pid: HashMap<pid_t, Vec<SymbolPidMappingWithFullPath>>,
3737
/// Deduplicated unwind data keyed by ELF path
38-
pub unwind_data_by_path: HashMap<String, UnwindData>,
38+
pub unwind_data_by_path: HashMap<PathBuf, UnwindData>,
3939
/// Per-pid mounting info referencing unwind data by path
4040
pub pid_unwind_data_mappings_by_pid: HashMap<pid_t, Vec<UnwindDataPidMappingWithFullPath>>,
4141
pub tracked_pids: HashSet<pid_t>,
@@ -51,7 +51,7 @@ pub fn parse_for_memmap2<P: AsRef<Path>>(
5151
) -> Result<MemmapRecordsOutput> {
5252
let mut symbols_by_path = HashMap::<PathBuf, ModuleSymbols>::new();
5353
let mut symbol_mappings_by_pid = HashMap::<pid_t, Vec<SymbolPidMappingWithFullPath>>::new();
54-
let mut unwind_data_by_path = HashMap::<String, UnwindData>::new();
54+
let mut unwind_data_by_path = HashMap::<PathBuf, UnwindData>::new();
5555
let mut unwind_mappings_by_pid = HashMap::<pid_t, Vec<UnwindDataPidMappingWithFullPath>>::new();
5656

5757
// 1MiB buffer
@@ -170,7 +170,7 @@ fn process_mmap2_record(
170170
record: linux_perf_data::linux_perf_event_reader::Mmap2Record,
171171
symbols_by_path: &mut HashMap<PathBuf, ModuleSymbols>,
172172
symbol_mappings_by_pid: &mut HashMap<pid_t, Vec<SymbolPidMappingWithFullPath>>,
173-
unwind_data_by_path: &mut HashMap<String, UnwindData>,
173+
unwind_data_by_path: &mut HashMap<PathBuf, UnwindData>,
174174
unwind_mappings_by_pid: &mut HashMap<pid_t, Vec<UnwindDataPidMappingWithFullPath>>,
175175
) {
176176
// Check PROT_EXEC early to avoid string allocation for non-executable mappings
@@ -257,9 +257,7 @@ fn process_mmap2_record(
257257
) {
258258
Ok((v3, mapping)) => {
259259
// Store pid-agnostic data once per path
260-
unwind_data_by_path
261-
.entry(record_path_string.clone())
262-
.or_insert(v3);
260+
unwind_data_by_path.entry(record_path.clone()).or_insert(v3);
263261

264262
// Always store per-pid mounting info
265263
unwind_mappings_by_pid

src/executor/wall_time/perf/save_artifacts.rs

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,38 @@ use std::path::{Path, PathBuf};
1212

1313
use super::parse_perf_file::{SymbolPidMappingWithFullPath, UnwindDataPidMappingWithFullPath};
1414

15+
/// Register a path in the map if absent, assigning a new unique key.
16+
/// Returns the assigned key.
17+
fn get_or_insert_key(path_to_key: &mut HashMap<PathBuf, String>, path: &Path) -> String {
18+
if let Some(key) = path_to_key.get(path) {
19+
return key.clone();
20+
}
21+
let key = naming::indexed_semantic_key(path_to_key.len(), path);
22+
path_to_key.insert(path.to_owned(), key.clone());
23+
key
24+
}
25+
1526
/// Save deduplicated symbol files to disk and build per-pid mappings using semantic keys.
1627
pub fn save_symbols(
1728
profile_folder: &Path,
1829
symbols_by_path: &HashMap<PathBuf, ModuleSymbols>,
1930
pid_symbol_mappings_by_pid: &HashMap<i32, Vec<SymbolPidMappingWithFullPath>>,
31+
path_to_key: &mut HashMap<PathBuf, String>,
2032
) -> HashMap<i32, Vec<SymbolPidMapping>> {
2133
debug!("Saving symbols ({} unique entries)", symbols_by_path.len());
2234

23-
let path_to_key = build_path_to_key_map(symbols_by_path.keys().map(|p| p.to_string_lossy()));
35+
// Pre-register all paths so the parallel section can use immutable borrows
36+
for path in symbols_by_path.keys() {
37+
get_or_insert_key(path_to_key, path);
38+
}
2439

2540
symbols_by_path.par_iter().for_each(|(path, module)| {
26-
let key = &path_to_key[&*path.to_string_lossy()];
41+
let key = &path_to_key[path];
2742
module.save_to(profile_folder, key).unwrap();
2843
});
2944

3045
convert_pid_mappings(pid_symbol_mappings_by_pid, |m| {
31-
let key = path_to_key.get(&*m.path.to_string_lossy())?.clone();
46+
let key = path_to_key.get(&m.path)?.clone();
3247
Some((
3348
key.clone(),
3449
SymbolPidMapping {
@@ -43,6 +58,7 @@ pub fn save_symbols(
4358
pub fn save_debug_info(
4459
symbols_by_path: &HashMap<PathBuf, ModuleSymbols>,
4560
pid_symbol_mappings_by_pid: &HashMap<i32, Vec<SymbolPidMappingWithFullPath>>,
61+
path_to_key: &mut HashMap<PathBuf, String>,
4662
) -> (
4763
HashMap<String, ModuleDebugInfo>,
4864
HashMap<i32, Vec<DebugInfoPidMapping>>,
@@ -51,16 +67,21 @@ pub fn save_debug_info(
5167

5268
let debug_info_by_elf_path = debug_info_by_path(symbols_by_path);
5369

54-
let path_to_key =
55-
build_path_to_key_map(debug_info_by_elf_path.keys().map(|p| p.to_string_lossy()));
70+
// Pre-register all paths so the closures below can use immutable borrows
71+
for path in debug_info_by_elf_path.keys() {
72+
get_or_insert_key(path_to_key, path);
73+
}
5674

5775
let debug_info: HashMap<String, ModuleDebugInfo> = debug_info_by_elf_path
5876
.into_iter()
59-
.map(|(path, info)| (path_to_key[&*path.to_string_lossy()].clone(), info))
77+
.filter_map(|(path, info)| {
78+
let key = path_to_key.get(&path)?.clone();
79+
Some((key, info))
80+
})
6081
.collect();
6182

6283
let pid_mappings = convert_pid_mappings(pid_symbol_mappings_by_pid, |m| {
63-
let key = path_to_key.get(&*m.path.to_string_lossy())?.clone();
84+
let key = path_to_key.get(&m.path)?.clone();
6485
Some((
6586
key.clone(),
6687
DebugInfoPidMapping {
@@ -76,18 +97,22 @@ pub fn save_debug_info(
7697
/// Save deduplicated unwind data files to disk and build per-pid mappings using semantic keys.
7798
pub fn save_unwind_data(
7899
profile_folder: &Path,
79-
unwind_data_by_path: &HashMap<String, UnwindData>,
100+
unwind_data_by_path: &HashMap<PathBuf, UnwindData>,
80101
pid_unwind_data_mappings_by_pid: HashMap<i32, Vec<UnwindDataPidMappingWithFullPath>>,
102+
path_to_key: &mut HashMap<PathBuf, String>,
81103
) -> HashMap<i32, Vec<UnwindDataPidMapping>> {
82104
debug!(
83105
"Saving unwind data ({} unique entries)",
84106
unwind_data_by_path.len()
85107
);
86108

87-
let path_to_key = build_path_to_key_map(unwind_data_by_path.keys().map(String::as_str));
109+
// Pre-register all paths so the parallel section can use immutable borrows
110+
for path in unwind_data_by_path.keys() {
111+
get_or_insert_key(path_to_key, path);
112+
}
88113

89114
unwind_data_by_path.par_iter().for_each(|(path, entry)| {
90-
let key = &path_to_key[path.as_str()];
115+
let key = &path_to_key[path];
91116
entry.save_to(profile_folder, key).unwrap();
92117
});
93118

@@ -97,7 +122,7 @@ pub fn save_unwind_data(
97122
let converted = mappings
98123
.into_iter()
99124
.map(|m| UnwindDataPidMapping {
100-
unwind_data_key: path_to_key[m.path.as_str()].clone(),
125+
unwind_data_key: path_to_key[&m.path].clone(),
101126
timestamp: m.timestamp,
102127
avma_range: m.avma_range,
103128
base_avma: m.base_avma,
@@ -173,18 +198,3 @@ fn convert_pid_mappings<T, K: Ord>(
173198
})
174199
.collect()
175200
}
176-
177-
/// Build a `path_string → semantic_key` map from an iterator of path-like items.
178-
fn build_path_to_key_map<I, S>(paths: I) -> HashMap<String, String>
179-
where
180-
I: Iterator<Item = S>,
181-
S: AsRef<str>,
182-
{
183-
paths
184-
.map(|p| {
185-
let s = p.as_ref().to_owned();
186-
let key = naming::path_to_semantic_key(&s);
187-
(s, key)
188-
})
189-
.collect()
190-
}

src/executor/wall_time/perf/unwind_data.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use object::Object;
77
use object::ObjectSection;
88
use runner_shared::unwind_data::UnwindData;
99
use std::ops::Range;
10+
use std::path::PathBuf;
1011

1112
use super::parse_perf_file::UnwindDataPidMappingWithFullPath;
1213

@@ -84,7 +85,7 @@ pub fn unwind_data_from_elf(
8485
};
8586

8687
let mapping = UnwindDataPidMappingWithFullPath {
87-
path,
88+
path: PathBuf::from(&path),
8889
timestamp: None,
8990
avma_range,
9091
base_avma,

0 commit comments

Comments
 (0)