Skip to content

Commit 3337731

Browse files
committed
Improve performance of writing markers
This commit improves the performance of samply-markers when enabled, based on benchmarks of the Fibonacci example increased to n = 30 and measured with hyperfine. The Fibonacci example at n = 30 makes 2,692,536 recursive calls, emitting the same number of markers. This provides a great stress test for marker emission overhead. The configurations are as follows: - disabled: samply-markers disabled - enabled-baseline: samply-markers enabled before any performance improvements - buffered-writes: adds the buffered writes improvement - itoa-buffers: adds the itoa improvement on top of buffered writes Linux results: Benchmark 1: ./fib30-disabled Time (mean ± σ): 128.2 ms ± 0.6 ms [User: 127.2 ms, System: 0.9 ms] Range (min … max): 127.1 ms … 129.6 ms 23 runs Benchmark 2: ./fib30-enabled-baseline Time (mean ± σ): 3.144 s ± 0.009 s [User: 0.924 s, System: 2.220 s] Range (min … max): 3.134 s … 3.161 s 10 runs Benchmark 3: ./fib30-enabled-buffered-writes Time (mean ± σ): 522.9 ms ± 1.1 ms [User: 478.9 ms, System: 43.7 ms] Range (min … max): 520.2 ms … 524.1 ms 10 runs Benchmark 4: ./fib30-enabled-itoa-buffers Time (mean ± σ): 423.1 ms ± 1.8 ms [User: 377.8 ms, System: 45.1 ms] Range (min … max): 419.6 ms … 426.3 ms 10 runs Summary ./fib30-disabled ran 3.30 ± 0.02 times faster than ./fib30-enabled-itoa-buffers 4.08 ± 0.02 times faster than ./fib30-enabled-buffered-writes 24.53 ± 0.14 times faster than ./fib30-enabled-baseline macOS results: Benchmark 1: ./fib30-disabled Time (mean ± σ): 90.7 ms ± 0.4 ms [User: 89.4 ms, System: 0.9 ms] Range (min … max): 90.0 ms … 91.6 ms 31 runs Benchmark 2: ./fib30-enabled-baseline Time (mean ± σ): 4.384 s ± 0.053 s [User: 0.578 s, System: 3.783 s] Range (min … max): 4.295 s … 4.495 s 10 runs Benchmark 3: ./fib30-enabled-buffered-writes Time (mean ± σ): 257.4 ms ± 1.8 ms [User: 237.6 ms, System: 16.9 ms] Range (min … max): 255.1 ms … 261.8 ms 11 runs Benchmark 4: ./fib30-enabled-itoa-buffers Time (mean ± σ): 187.6 ms ± 1.1 ms [User: 169.9 ms, System: 16.1 ms] Range (min … max): 186.3 ms … 190.1 ms 15 runs Summary ./fib30-disabled ran 2.07 ± 0.01 times faster than ./fib30-enabled-itoa-buffers 2.84 ± 0.02 times faster than ./fib30-enabled-buffered-writes 48.32 ± 0.62 times faster than ./fib30-enabled-baseline Profiles: Ubuntu: * fib30-disabled: https://share.firefox.dev/48id64U * fib30-enabled-baseline: https://share.firefox.dev/47K8Taf * fib30-enabled-buffered-writes: https://share.firefox.dev/4oK5Uog * fib30-enabled-itoa-buffers: https://share.firefox.dev/47ZtGVQ macOS: * fib30-disabled: https://share.firefox.dev/3LMmaX2 * fib30-enabled-baseline: https://share.firefox.dev/49TPODF * fib30-enabled-buffered-writes: https://share.firefox.dev/4oFo1LP * fib30-enabled-itoa-buffers: https://share.firefox.dev/3K6rLqG Overall: - Using buffered writes dramatically reduces overhead. - Using itoa provides an additional ~19% improvement on Linux and ~27% on macOS.
1 parent 4e7944b commit 3337731

File tree

5 files changed

+421
-191
lines changed

5 files changed

+421
-191
lines changed

Cargo.lock

Lines changed: 1 addition & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

samply-markers/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ keywords = ["profiling", "markers", "instrumentation", "tracing", "performance"]
1414
[features]
1515
default = []
1616
enabled = [
17+
"dep:itoa",
1718
"dep:nix",
18-
"dep:smallstr",
1919
"dep:tempfile",
2020
]
2121

2222
[target.'cfg(unix)'.dependencies]
23+
itoa = { version = "1.0.15", optional = true }
2324
nix = { version = "0.30.1", features = ["mman", "process", "time"], optional = true }
24-
smallstr = { version = "0.3.1", optional = true }
2525
tempfile = { version = "3.23.0", optional = true }
2626

2727
[target.'cfg(target_os = "macos")'.dependencies]

samply-markers/src/marker/timestamp.rs

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,14 @@ impl SamplyTimestamp {
3838
Self(nanos)
3939
}
4040

41-
/// Formats a [`SamplyTimestamp`] without implementing [`std::fmt::Display`].
41+
/// Returns the inner nanosecond value.
4242
///
43-
/// Having a dedicated formatter keeps [`SamplyTimestamp`] opaque, ensuring that it
44-
/// cannot be stringified via [`ToString`] and compared.
43+
/// This is only exposed within the crate for formatting and testing purposes,
44+
/// keeping [`SamplyTimestamp`] opaque to external consumers.
45+
#[inline]
4546
#[cfg(feature = "enabled")]
46-
pub(crate) fn fmt<W>(self, writer: &mut W) -> std::fmt::Result
47-
where
48-
W: std::fmt::Write + ?Sized,
49-
{
50-
std::fmt::Write::write_fmt(writer, format_args!("{}", self.0))
47+
pub(crate) const fn as_nanos(self) -> u64 {
48+
self.0
5149
}
5250
}
5351

@@ -83,16 +81,12 @@ mod test {
8381

8482
#[test]
8583
#[cfg(feature = "enabled")]
86-
fn fmt_writes_correctly() {
84+
fn as_nanos_returns_correct_value() {
8785
let time = SamplyTimestamp::from_monotonic_nanos(9876543210);
88-
let mut buffer = String::new();
89-
90-
let result = time.fmt(&mut buffer);
91-
assert!(result.is_ok(), "Expected the fmt operation to succeed.");
92-
9386
assert_eq!(
94-
buffer, "9876543210",
95-
"Expected the buffer to contain the formatted timestamp."
87+
time.as_nanos(),
88+
9876543210,
89+
"Expected as_nanos to return the underlying nanosecond value."
9690
);
9791
}
9892

samply-markers/src/provider/unix.rs

Lines changed: 82 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
use std::cell::RefCell;
44
use std::fs::File;
5+
use std::io::BufWriter;
56
use std::io::Write;
67
use std::num::NonZeroUsize;
78
use std::path::Path;
@@ -22,10 +23,36 @@ use nix::time::ClockId;
2223
#[cfg(target_os = "linux")]
2324
use nix::time::clock_gettime;
2425

25-
use smallstr::SmallString;
2626
use tempfile::TempDir;
2727
use tempfile::tempdir;
2828

29+
/// A wrapper around [`std::io::BufWriter`] that flushes on drop to ensure data is written.
30+
///
31+
/// This is used in conjunction with [`std::thread_local!`] to ensure that buffers are
32+
/// flushed when the thread exits.
33+
struct FlushOnDrop(BufWriter<File>);
34+
35+
impl Drop for FlushOnDrop {
36+
fn drop(&mut self) {
37+
if let Err(e) = self.0.flush() {
38+
eprintln!("samply-markers: failed to flush marker-file buffer: {e}");
39+
}
40+
}
41+
}
42+
43+
impl std::ops::Deref for FlushOnDrop {
44+
type Target = BufWriter<File>;
45+
fn deref(&self) -> &Self::Target {
46+
&self.0
47+
}
48+
}
49+
50+
impl std::ops::DerefMut for FlushOnDrop {
51+
fn deref_mut(&mut self) -> &mut Self::Target {
52+
&mut self.0
53+
}
54+
}
55+
2956
/// A lazily created directory that holds per-thread mmapped marker files.
3057
static MARKERS_DIR: LazyLock<Option<PathBuf>> = LazyLock::new(|| {
3158
tempdir()
@@ -35,8 +62,8 @@ static MARKERS_DIR: LazyLock<Option<PathBuf>> = LazyLock::new(|| {
3562
});
3663

3764
thread_local! {
38-
/// A thread-local file used to append marker lines that samply will ingest.
39-
static MARKER_FILE: RefCell<Option<File>> = RefCell::new(create_marker_file());
65+
/// A thread-local buffered writer used to append marker lines that samply will ingest.
66+
static MARKER_FILE: RefCell<Option<FlushOnDrop>> = RefCell::new(create_marker_file());
4067
}
4168

4269
#[cfg(target_os = "macos")]
@@ -47,10 +74,12 @@ pub struct TimestampNowImpl;
4774

4875
impl TimestampNowProvider for TimestampNowImpl {
4976
/// Queries the monotonic clock and converts the result to monotonic nanoseconds.
77+
#[inline]
5078
fn now() -> SamplyTimestamp {
5179
#[cfg(target_os = "linux")]
5280
{
53-
let now = clock_gettime(ClockId::CLOCK_MONOTONIC).unwrap();
81+
let now = clock_gettime(ClockId::CLOCK_MONOTONIC)
82+
.expect("samply-markers: unable to get time from CLOCK_MONOTONIC");
5483

5584
// Monotonic nanoseconds should only ever be positive.
5685
#[allow(clippy::cast_sign_loss)]
@@ -94,15 +123,23 @@ pub struct WriteMarkerImpl;
94123

95124
impl WriteMarkerProvider for WriteMarkerImpl {
96125
/// Serializes the marker to the thread-local file, creating it on demand.
126+
#[inline]
97127
fn write_marker(start: SamplyTimestamp, end: SamplyTimestamp, marker: &SamplyMarker) {
98-
let mut s = SmallString::<[u8; 64]>::new();
99-
start.fmt(&mut s).unwrap();
100-
s.push(' ');
101-
end.fmt(&mut s).unwrap();
102-
s.push(' ');
103-
s.push_str(marker.name());
104-
s.push('\n');
105-
let _ = with_marker_file(|f| f.write_all(s.as_bytes()));
128+
let () = MARKER_FILE.with_borrow_mut(|file| {
129+
if let Some(f) = file.as_mut() {
130+
// Use stack-allocated itoa buffers to format timestamps without heap allocation.
131+
// Format: "{start_ns} {end_ns} {marker_name}\n"
132+
let mut start_buf = itoa::Buffer::new();
133+
let mut end_buf = itoa::Buffer::new();
134+
135+
let _ = f.write_all(start_buf.format(start.as_nanos()).as_bytes());
136+
let _ = f.write_all(b" ");
137+
let _ = f.write_all(end_buf.format(end.as_nanos()).as_bytes());
138+
let _ = f.write_all(b" ");
139+
let _ = f.write_all(marker.name().as_bytes());
140+
let _ = f.write_all(b"\n");
141+
}
142+
});
106143
}
107144
}
108145

@@ -111,16 +148,6 @@ fn markers_dir() -> Option<&'static Path> {
111148
Some(LazyLock::force(&MARKERS_DIR).as_ref()?)
112149
}
113150

114-
/// Executes the provided closure with a mutable handle to the thread-local marker file.
115-
///
116-
/// Returns [`None`] when the file could not be prepared.
117-
fn with_marker_file<F, R>(f: F) -> Option<R>
118-
where
119-
F: FnOnce(&mut File) -> R,
120-
{
121-
MARKER_FILE.with_borrow_mut(|file| file.as_mut().map(f))
122-
}
123-
124151
/// Returns a unique thread identifier for the current thread.
125152
#[cfg(target_os = "linux")]
126153
fn get_thread_id() -> u32 {
@@ -159,7 +186,7 @@ fn get_thread_id() -> u32 {
159186
}
160187

161188
/// Creates a new mmapped marker file for the current thread.
162-
fn create_marker_file() -> Option<File> {
189+
fn create_marker_file() -> Option<FlushOnDrop> {
163190
let file_name = markers_dir()?.join(format!(
164191
"marker-{}-{}.txt",
165192
nix::unistd::getpid().as_raw(),
@@ -216,5 +243,36 @@ fn create_marker_file() -> Option<File> {
216243
})
217244
.ok()?;
218245

219-
Some(file)
246+
// I benchmarked the buffer size with hyperfine, using the Fibonacci example
247+
// given n = 30, which generates about 2.7 million markers per run.
248+
//
249+
// Ubuntu results:
250+
// ------------------------------------------
251+
// 4 KiB: 470.9 ms ± 1.2 ms (5.3% slower)
252+
// 8 KiB: 457.4 ms ± 2.1 ms (2.2% slower)
253+
// 16 KiB: 451.0 ms ± 1.3 ms (0.8% slower)
254+
// 32 KiB: 447.6 ms ± 1.5 ms (0.1% slower)
255+
// 64 KiB: 447.4 ms ± 2.9 ms (optimal speed)
256+
// 128 KiB: 448.6 ms ± 1.1 ms (0.3% slower)
257+
// 256 KiB: 450.5 ms ± 1.4 ms (0.7% slower)
258+
// 512 KiB: 450.1 ms ± 1.1 ms (0.6% slower)
259+
// 1024 KiB: 449.9 ms ± 2.6 ms (0.6% slower)
260+
//
261+
// macOS results:
262+
// ------------------------------------------
263+
// 4 KiB: 303.0 ms ± 2.5 ms (43.9% slower)
264+
// 8 KiB: 258.3 ms ± 1.2 ms (22.6% slower)
265+
// 16 KiB: 236.4 ms ± 1.2 ms (12.2% slower)
266+
// 32 KiB: 223.4 ms ± 0.7 ms (06.0% slower)
267+
// 64 KiB: 217.2 ms ± 1.2 ms (03.1% slower)
268+
// 128 KiB: 213.3 ms ± 0.7 ms (01.3% slower)
269+
// 256 KiB: 211.9 ms ± 1.3 ms (00.6% slower)
270+
// 512 KiB: 210.6 ms ± 0.7 ms (optimal speed)
271+
// 1024 KiB: 210.7 ms ± 0.7 ms (00.0% slower)
272+
//
273+
// Even though macOS benefitted from a larger buffer size,
274+
// I feel like a 512 KiB buffer per thread feels like overkill.
275+
//
276+
// 64 KiB feels like a good balance based on this benchmark.
277+
Some(FlushOnDrop(BufWriter::with_capacity(64 * 1024, file)))
220278
}

0 commit comments

Comments
 (0)