From 4e33856871092f663ef0aa8dea48e086b29247a9 Mon Sep 17 00:00:00 2001
From: pshu <stormslowly@gmail.com>
Date: Wed, 27 May 2026 10:59:54 +0800
Subject: [PATCH 1/2] chore(bench): split specifier microbenches into a
 separate bench binary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`specifier/realistic[rw/hash-only]` (~340 instructions) and
`specifier/branches[fragment/short]` (~170 instructions) are short
enough that the cold instruction-cache fill on each CodSpeed
measurement iteration dominates the result: ~200 cycles of fixed
overhead translates to +5% to +10% deltas under any unrelated
binary-layout shift, even when the parser itself is unchanged or
faster. CodSpeed surfaces these as false-positive regressions.

Move the four `specifier/*` bench groups into their own
`benches/specifier.rs` and register it as a second `[[bench]]` in
Cargo.toml. Each `[[bench]]` runs in its own process, so the
specifier binary gets a fresh, much smaller instruction-cache
footprint instead of competing with the large `bench_resolver` code
for cache lines. The per-case Ir is unchanged — what changes is the
working-set the kernel and the L1/LL caches see before measurement
starts, which makes cold-start misses predictable across runs.

- `benches/specifier.rs`: new file. Allocator wrapper mirrors the one
  in `bench_resolver` so allocation costs are measured identically.
- `benches/resolver.rs`: drops `specifier/*` groups, helpers,
  unused imports.
- `Cargo.toml`: adds `[[bench]] name = "specifier"`.

No code paths in `Specifier::parse` are touched; this is purely
test-infrastructure stabilization.
---
 Cargo.toml           |  10 ++
 benches/resolver.rs  | 244 ++------------------------------------
 benches/specifier.rs | 273 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+), 237 deletions(-)
 create mode 100644 benches/specifier.rs
diff --git a/Cargo.toml b/Cargo.toml
index 529adb90..44f5447c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,16 @@ harness           = false
 name              = "resolver"
 required-features = ["__internal_bench"]
 
+# Specifier microbenchmarks are split into their own bench binary so they
+# don't share an instruction cache with the much larger `resolver` bench
+# code. Cases like `specifier/realistic[rw/hash-only]` only execute a few
+# hundred instructions per iteration; cold-start cache misses in a combined
+# binary previously surfaced as false-positive CodSpeed regressions.
+[[bench]]
+harness           = false
+name              = "specifier"
+required-features = ["__internal_bench"]
+
 [lints.clippy]
 all   = { level = "warn", priority = -1 }
 cargo = { level = "warn", priority = -1 }
diff --git a/benches/resolver.rs b/benches/resolver.rs
index 2a66e6aa..08aa09cc 100644
--- a/benches/resolver.rs
+++ b/benches/resolver.rs
@@ -46,10 +46,8 @@ unsafe impl<A: GlobalAlloc> GlobalAlloc for NeverGrowInPlaceAllocator<A> {
   }
 }
 
-use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
-use rspack_resolver::{
-  FileSystemOptions, FileSystemOs, ResolveOptions, Resolver, __BenchSpecifier as Specifier,
-};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use rspack_resolver::{FileSystemOptions, FileSystemOs, ResolveOptions, Resolver};
 use serde_json::Value;
 use tokio::{
   runtime::{self, Builder},
@@ -415,238 +413,10 @@ fn bench_resolver(c: &mut Criterion) {
   );
 }
 
-// ============================================================================
-// Specifier micro-benchmarks
-// ----------------------------------------------------------------------------
-// `parse_query_framgment` lives behind `Specifier::parse`. The wrapper does a
-// single byte read + length check, so benchmarking `parse` is effectively
-// benchmarking the query/fragment scanner. Cases are split into four groups:
-// branch matrix, length sweep, escape scaling, and realistic specimens.
-// ============================================================================
-
-// `path` is repeated to reach `len`, then optional `?query` / `#fragment`
-// suffix is appended. Lets us scale a single shape across short/medium/long
-// inputs without changing branch coverage.
-fn specifier_shaped(base: &str, len: usize, query: Option<&str>, fragment: Option<&str>) -> String {
-  let mut s = String::with_capacity(len + 64);
-  while s.len() < len {
-    s.push_str(base);
-  }
-  s.truncate(len);
-  if let Some(q) = query {
-    s.push('?');
-    s.push_str(q);
-  }
-  if let Some(f) = fragment {
-    s.push('#');
-    s.push_str(f);
-  }
-  s
-}
-
-// Webpack-style synthesized escapes: inserts `\0#` sequences inside the path
-// so the scanner hits the `prev == '\0'` branch and is forced onto the
-// Cow::Owned slow path.
-fn specifier_with_escapes(parts: &[&str]) -> String {
-  parts.join("\0#")
-}
-
-fn specifier_branch_cases() -> Vec<(&'static str, String)> {
-  vec![
-    // 1. (None, None) — fast path, Cow::Borrowed
-    ("none/short", "./foo.js".to_string()),
-    (
-      "none/medium",
-      "./packages/utils/src/internal/helpers/normalizePath.ts".to_string(),
-    ),
-    // 2. (Some, None) — query only
-    ("query/short", "./a.js?vue".to_string()),
-    (
-      "query/medium",
-      "./Button.tsx?vue&type=script&lang=ts&scoped=true&hash=abc12345".to_string(),
-    ),
-    // 3. (None, Some) — fragment only, scanner breaks early
-    ("fragment/short", "./a.js#main".to_string()),
-    (
-      "fragment/medium",
-      "./pages/Home.tsx#section-introduction-to-the-rspack-resolver".to_string(),
-    ),
-    // 4. (Some, Some) — query then fragment
-    ("query+fragment/short", "./a.js?x#y".to_string()),
-    (
-      "query+fragment/medium",
-      "./Widget.vue?vue&type=template&lang=html#root".to_string(),
-    ),
-    // 5. multiple `?`, only first becomes query_start
-    (
-      "multi-question",
-      "./a.js?one?two?three?four?five?six?seven".to_string(),
-    ),
-    // 6. `\0#` escape → slow path (Cow::Owned + char_indices filter)
-    (
-      "escape/single",
-      specifier_with_escapes(&["path/", "real-hash"]),
-    ),
-    // 7. multiple escapes — repeats the slow path several times in one input
-    (
-      "escape/many",
-      specifier_with_escapes(&["./pkg/", "repo", "repo2", "repo3", "repo4#hash"]),
-    ),
-    // 8. leading `/`, `.`, `#` → offset=1; the first char is skipped intentionally
-    ("leading-slash", "/abs/path/to/file.mjs?q#f".to_string()),
-    ("leading-hash", "#alias/module.cjs?q#f".to_string()),
-    // 9. bare module — offset=0, scan starts at index 0
-    (
-      "bare-module",
-      "@scope/package/sub/path/index.js?q#f".to_string(),
-    ),
-    // 10. `?` inside a fragment must NOT be promoted to query — scanner already
-    //     broke at the `#`, but worth pinning so a future refactor can't regress it.
-    (
-      "fragment-with-question",
-      "./a.js#frag?not-a-query&also-not".to_string(),
-    ),
-  ]
-}
-
-// Same shape, four sizes — measures how each branch scales with input length.
-// Sizes chosen at ~5x steps so codspeed renders a clear curve.
-const SPECIFIER_LENGTH_TIERS: &[(&str, usize)] = &[
-  ("len_8", 8),
-  ("len_64", 64),
-  ("len_256", 256),
-  ("len_1536", 1536),
-];
-
-#[allow(clippy::type_complexity)]
-fn specifier_length_shapes() -> Vec<(
-  &'static str,
-  &'static str,
-  Option<&'static str>,
-  Option<&'static str>,
-)> {
-  vec![
-    // Pure path: stresses the loop body without any branch hits.
-    ("path-only", "./a/b/c/d/e/", None, None),
-    // Query at the very end: full scan before query_start is set.
-    (
-      "query-tail",
-      "./a/b/c/d/e/",
-      Some("vue&type=script&lang=ts"),
-      None,
-    ),
-    // Fragment at the very end: full scan, then early break at last char.
-    ("frag-tail", "./a/b/c/d/e/", None, Some("section-end")),
-    // Both at the tail.
-    (
-      "both-tail",
-      "./a/b/c/d/e/",
-      Some("vue&type=script"),
-      Some("hash"),
-    ),
-  ]
-}
-
-// Hand-picked from typical rspack/webpack loader chains; these are what the
-// parser actually sees in a production resolve flow.
-fn specifier_realistic_cases() -> Vec<(&'static str, &'static str)> {
-  vec![
-    ("rw/loader-chain",
-     "./node_modules/.pnpm/vue-loader@17.0.0/node_modules/vue-loader/dist/templateLoader.js?vue&type=template&id=2f8c6e7a&scoped=true&lang=html"),
-    ("rw/css-modules",
-     "./src/components/Sidebar/Sidebar.module.css?ngGlobalStyle&hash=d41d8cd98f00b204e9800998ecf8427e"),
-    ("rw/asset-query",
-     "./public/assets/images/hero@2x.png?as=webp&w=1920&h=1080&quality=80&format=webp"),
-    ("rw/hash-only",
-     "./shared/utils/index.ts#tree-shaken-export-marker-do-not-strip"),
-    ("rw/inline-loader",
-     "!!./node_modules/css-loader/dist/cjs.js??ref--6-oneOf-1-1!./node_modules/postcss-loader/dist/cjs.js??ref--6-oneOf-1-2!./src/App.vue?vue&type=style&index=0&id=7ba5bd90&scoped=true&lang=css"),
-  ]
-}
-
-fn bench_specifier_branches(c: &mut Criterion) {
-  let mut group = c.benchmark_group("specifier/branches");
-  for (label, input) in specifier_branch_cases() {
-    group.throughput(Throughput::Bytes(input.len() as u64));
-    group.bench_with_input(BenchmarkId::from_parameter(label), &input, |b, s| {
-      b.iter(|| {
-        let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
-        black_box(parsed);
-      });
-    });
-  }
-  group.finish();
-}
-
-fn bench_specifier_length_sweep(c: &mut Criterion) {
-  let mut group = c.benchmark_group("specifier/length");
-  for (shape_label, base, query, fragment) in specifier_length_shapes() {
-    for (len_label, len) in SPECIFIER_LENGTH_TIERS {
-      let input = specifier_shaped(base, *len, query, fragment);
-      let id = BenchmarkId::new(shape_label, len_label);
-      group.throughput(Throughput::Bytes(input.len() as u64));
-      group.bench_with_input(id, &input, |b, s| {
-        b.iter(|| {
-          let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
-          black_box(parsed);
-        });
-      });
-    }
-  }
-  group.finish();
-}
-
-fn bench_specifier_escape_scaling(c: &mut Criterion) {
-  // Slow path scales with both input length AND the number of escapes (the
-  // filter closure does an O(n*k) `escaped_indexes.contains(&i)` per char).
-  // Worth a dedicated knob so the optimizer can target it.
-  let mut group = c.benchmark_group("specifier/escapes");
-  for &n in &[1usize, 4, 16, 64] {
-    // `parts.len()` must equal `n + 1` so that `join("\0#")` inserts exactly
-    // `n` separators (= `n` escape markers in the input). The first element
-    // is a path prefix, the last is the real `#fragment`, and we pad the
-    // middle with `n - 1` filler segments.
-    let mut parts = vec!["./pkg/"];
-    for _ in 0..n.saturating_sub(1) {
-      parts.push("segment");
-    }
-    parts.push("real#hash");
-    let input = specifier_with_escapes(&parts);
-    group.throughput(Throughput::Bytes(input.len() as u64));
-    group.bench_with_input(
-      BenchmarkId::from_parameter(format!("escapes_{n}")),
-      &input,
-      |b, s| {
-        b.iter(|| {
-          let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
-          black_box(parsed);
-        });
-      },
-    );
-  }
-  group.finish();
-}
-
-fn bench_specifier_realistic(c: &mut Criterion) {
-  let mut group = c.benchmark_group("specifier/realistic");
-  for (label, input) in specifier_realistic_cases() {
-    group.throughput(Throughput::Bytes(input.len() as u64));
-    group.bench_with_input(BenchmarkId::from_parameter(label), input, |b, s| {
-      b.iter(|| {
-        let parsed = Specifier::parse(black_box(s)).unwrap();
-        black_box(parsed);
-      });
-    });
-  }
-  group.finish();
-}
+// Specifier microbenchmarks live in `benches/specifier.rs` (separate
+// `[[bench]]` binary) so the very short `specifier/*` cases get a fresh
+// instruction cache instead of competing with the resolver bench code for
+// cache lines. See that file for the rationale.
 
-criterion_group!(
-  resolver,
-  bench_resolver,
-  bench_specifier_branches,
-  bench_specifier_length_sweep,
-  bench_specifier_escape_scaling,
-  bench_specifier_realistic
-);
+criterion_group!(resolver, bench_resolver);
 criterion_main!(resolver);
diff --git a/benches/specifier.rs b/benches/specifier.rs
new file mode 100644
index 00000000..e28ff8aa
--- /dev/null
+++ b/benches/specifier.rs
@@ -0,0 +1,273 @@
+//! Microbenchmarks for `Specifier::parse`.
+//!
+//! Kept in a separate bench binary from the `resolver` bench for measurement
+//! stability: each `[[bench]]` runs in its own process, so the short
+//! `specifier/*` cases get a fresh, predictable instruction cache instead of
+//! competing with the much larger resolver bench code for cache lines. This
+//! keeps cold-start cache misses out of the per-case CodSpeed deltas.
+
+#[cfg(target_family = "wasm")]
+use std::alloc::System;
+use std::alloc::{GlobalAlloc, Layout};
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use rspack_resolver::__BenchSpecifier as Specifier;
+
+#[global_allocator]
+#[cfg(not(target_family = "wasm"))]
+static GLOBAL: NeverGrowInPlaceAllocator<mimalloc::MiMalloc> =
+  NeverGrowInPlaceAllocator::new(mimalloc::MiMalloc);
+
+#[global_allocator]
+#[cfg(target_family = "wasm")]
+static GLOBAL: NeverGrowInPlaceAllocator<System> = NeverGrowInPlaceAllocator::new(System);
+
+/// Mirrors the allocator wrapper in `benches/resolver.rs` so allocation costs
+/// are measured identically across both bench binaries. See that file for the
+/// rationale.
+struct NeverGrowInPlaceAllocator<A> {
+  allocator: A,
+}
+
+impl<A> NeverGrowInPlaceAllocator<A> {
+  const fn new(allocator: A) -> Self {
+    Self { allocator }
+  }
+}
+
+// SAFETY: Methods simply delegate to the wrapped allocator.
+unsafe impl<A: GlobalAlloc> GlobalAlloc for NeverGrowInPlaceAllocator<A> {
+  unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+    self.allocator.alloc(layout)
+  }
+
+  unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+    self.allocator.dealloc(ptr, layout)
+  }
+}
+
+// `base` is repeated to reach `len`, then optional `?query` / `#fragment`
+// suffix is appended. Lets us scale a single shape across short/medium/long
+// inputs without changing branch coverage.
+fn specifier_shaped(base: &str, len: usize, query: Option<&str>, fragment: Option<&str>) -> String {
+  let mut s = String::with_capacity(len + 64);
+  while s.len() < len {
+    s.push_str(base);
+  }
+  s.truncate(len);
+  if let Some(q) = query {
+    s.push('?');
+    s.push_str(q);
+  }
+  if let Some(f) = fragment {
+    s.push('#');
+    s.push_str(f);
+  }
+  s
+}
+
+// Webpack-style synthesized escapes: inserts `\0#` sequences inside the path
+// so the scanner hits the `prev == '\0'` branch and is forced onto the
+// Cow::Owned slow path.
+fn specifier_with_escapes(parts: &[&str]) -> String {
+  parts.join("\0#")
+}
+
+fn specifier_branch_cases() -> Vec<(&'static str, String)> {
+  vec![
+    // 1. (None, None) — fast path, Cow::Borrowed
+    ("none/short", "./foo.js".to_string()),
+    (
+      "none/medium",
+      "./packages/utils/src/internal/helpers/normalizePath.ts".to_string(),
+    ),
+    // 2. (Some, None) — query only
+    ("query/short", "./a.js?vue".to_string()),
+    (
+      "query/medium",
+      "./Button.tsx?vue&type=script&lang=ts&scoped=true&hash=abc12345".to_string(),
+    ),
+    // 3. (None, Some) — fragment only, scanner breaks early
+    ("fragment/short", "./a.js#main".to_string()),
+    (
+      "fragment/medium",
+      "./pages/Home.tsx#section-introduction-to-the-rspack-resolver".to_string(),
+    ),
+    // 4. (Some, Some) — query then fragment
+    ("query+fragment/short", "./a.js?x#y".to_string()),
+    (
+      "query+fragment/medium",
+      "./Widget.vue?vue&type=template&lang=html#root".to_string(),
+    ),
+    // 5. multiple `?`, only first becomes query_start
+    (
+      "multi-question",
+      "./a.js?one?two?three?four?five?six?seven".to_string(),
+    ),
+    // 6. `\0#` escape → slow path (Cow::Owned + char_indices filter)
+    (
+      "escape/single",
+      specifier_with_escapes(&["path/", "real-hash"]),
+    ),
+    // 7. multiple escapes — repeats the slow path several times in one input
+    (
+      "escape/many",
+      specifier_with_escapes(&["./pkg/", "repo", "repo2", "repo3", "repo4#hash"]),
+    ),
+    // 8. leading `/`, `.`, `#` → offset=1; the first char is skipped intentionally
+    ("leading-slash", "/abs/path/to/file.mjs?q#f".to_string()),
+    ("leading-hash", "#alias/module.cjs?q#f".to_string()),
+    // 9. bare module — offset=0, scan starts at index 0
+    (
+      "bare-module",
+      "@scope/package/sub/path/index.js?q#f".to_string(),
+    ),
+    // 10. `?` inside a fragment must NOT be promoted to query — scanner already
+    //     broke at the `#`, but worth pinning so a future refactor can't regress it.
+    (
+      "fragment-with-question",
+      "./a.js#frag?not-a-query&also-not".to_string(),
+    ),
+  ]
+}
+
+// Same shape, four sizes — measures how each branch scales with input length.
+// Sizes chosen at ~5x steps so codspeed renders a clear curve.
+const SPECIFIER_LENGTH_TIERS: &[(&str, usize)] = &[
+  ("len_8", 8),
+  ("len_64", 64),
+  ("len_256", 256),
+  ("len_1536", 1536),
+];
+
+#[allow(clippy::type_complexity)]
+fn specifier_length_shapes() -> Vec<(
+  &'static str,
+  &'static str,
+  Option<&'static str>,
+  Option<&'static str>,
+)> {
+  vec![
+    // Pure path: stresses the loop body without any branch hits.
+    ("path-only", "./a/b/c/d/e/", None, None),
+    // Query at the very end: full scan before query_start is set.
+    (
+      "query-tail",
+      "./a/b/c/d/e/",
+      Some("vue&type=script&lang=ts"),
+      None,
+    ),
+    // Fragment at the very end: full path scan, then early break at the `#`.
+    ("frag-tail", "./a/b/c/d/e/", None, Some("section-end")),
+    // Both at the tail.
+    (
+      "both-tail",
+      "./a/b/c/d/e/",
+      Some("vue&type=script"),
+      Some("hash"),
+    ),
+  ]
+}
+
+// Hand-picked from typical rspack/webpack loader chains; these are what the
+// parser actually sees in a production resolve flow.
+fn specifier_realistic_cases() -> Vec<(&'static str, &'static str)> {
+  vec![
+    ("rw/loader-chain",
+     "./node_modules/.pnpm/vue-loader@17.0.0/node_modules/vue-loader/dist/templateLoader.js?vue&type=template&id=2f8c6e7a&scoped=true&lang=html"),
+    ("rw/css-modules",
+     "./src/components/Sidebar/Sidebar.module.css?ngGlobalStyle&hash=d41d8cd98f00b204e9800998ecf8427e"),
+    ("rw/asset-query",
+     "./public/assets/images/hero@2x.png?as=webp&w=1920&h=1080&quality=80&format=webp"),
+    ("rw/hash-only",
+     "./shared/utils/index.ts#tree-shaken-export-marker-do-not-strip"),
+    ("rw/inline-loader",
+     "!!./node_modules/css-loader/dist/cjs.js??ref--6-oneOf-1-1!./node_modules/postcss-loader/dist/cjs.js??ref--6-oneOf-1-2!./src/App.vue?vue&type=style&index=0&id=7ba5bd90&scoped=true&lang=css"),
+  ]
+}
+
+fn bench_specifier_branches(c: &mut Criterion) {
+  let mut group = c.benchmark_group("specifier/branches");
+  for (label, input) in specifier_branch_cases() {
+    group.throughput(Throughput::Bytes(input.len() as u64));
+    group.bench_with_input(BenchmarkId::from_parameter(label), &input, |b, s| {
+      b.iter(|| {
+        let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
+        black_box(parsed);
+      });
+    });
+  }
+  group.finish();
+}
+
+fn bench_specifier_length_sweep(c: &mut Criterion) {
+  let mut group = c.benchmark_group("specifier/length");
+  for (shape_label, base, query, fragment) in specifier_length_shapes() {
+    for (len_label, len) in SPECIFIER_LENGTH_TIERS {
+      let input = specifier_shaped(base, *len, query, fragment);
+      let id = BenchmarkId::new(shape_label, len_label);
+      group.throughput(Throughput::Bytes(input.len() as u64));
+      group.bench_with_input(id, &input, |b, s| {
+        b.iter(|| {
+          let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
+          black_box(parsed);
+        });
+      });
+    }
+  }
+  group.finish();
+}
+
+fn bench_specifier_escape_scaling(c: &mut Criterion) {
+  // Slow path scales with both input length AND the number of escapes (the
+  // filter closure does an O(n*k) `escaped_indexes.contains(&i)` per char).
+  // Worth a dedicated knob so the optimizer can target it.
+  let mut group = c.benchmark_group("specifier/escapes");
+  for &n in &[1usize, 4, 16, 64] {
+    // `parts.len()` must equal `n + 1` so that `join("\0#")` inserts exactly
+    // `n` separators (= `n` escape markers in the input). The first element
+    // is a path prefix, the last is the real `#fragment`, and we pad the
+    // middle with `n - 1` filler segments.
+    let mut parts = vec!["./pkg/"];
+    for _ in 0..n.saturating_sub(1) {
+      parts.push("segment");
+    }
+    parts.push("real#hash");
+    let input = specifier_with_escapes(&parts);
+    group.throughput(Throughput::Bytes(input.len() as u64));
+    group.bench_with_input(
+      BenchmarkId::from_parameter(format!("escapes_{n}")),
+      &input,
+      |b, s| {
+        b.iter(|| {
+          let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
+          black_box(parsed);
+        });
+      },
+    );
+  }
+  group.finish();
+}
+
+fn bench_specifier_realistic(c: &mut Criterion) {
+  let mut group = c.benchmark_group("specifier/realistic");
+  for (label, input) in specifier_realistic_cases() {
+    group.throughput(Throughput::Bytes(input.len() as u64));
+    group.bench_with_input(BenchmarkId::from_parameter(label), input, |b, s| {
+      b.iter(|| {
+        let parsed = Specifier::parse(black_box(s)).unwrap();
+        black_box(parsed);
+      });
+    });
+  }
+  group.finish();
+}
+
+criterion_group!(
+  specifier,
+  bench_specifier_branches,
+  bench_specifier_length_sweep,
+  bench_specifier_escape_scaling,
+  bench_specifier_realistic
+);
+criterion_main!(specifier);

From 68da90c63000de2be16050744eac0f5b4e65d570 Mon Sep 17 00:00:00 2001
From: pshu <stormslowly@gmail.com>
Date: Wed, 27 May 2026 17:14:27 +0800
Subject: [PATCH 2/2] chore(bench): warm parse before b.iter to isolate cold
 i-cache misses

CodSpeed's `WARMUP_RUNS=5` inside `b.iter` primes the harness but does
not absorb the single cold I-fetch miss (~105 estimated cycles) that a
binary-layout shift can introduce on short cases like
`specifier/realistic[rw/hash-only]`. Add a per-input `warm_parse` setup
pass that runs 32 parses outside the Callgrind instrumentation window,
paging in parse code, lazy-initializing the allocator, and training the
branch predictor on the actual input before measurement begins.
---
 benches/specifier.rs | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/benches/specifier.rs b/benches/specifier.rs
index e28ff8aa..019fa18c 100644
--- a/benches/specifier.rs
+++ b/benches/specifier.rs
@@ -5,6 +5,15 @@
 //! `specifier/*` cases get a fresh, predictable instruction cache instead of
 //! competing with the much larger resolver bench code for cache lines. This
 //! keeps cold-start cache misses out of the per-case CodSpeed deltas.
+//!
+//! Each `bench_with_input` additionally runs a per-input `warm_parse` setup
+//! pass before `b.iter`. CodSpeed's `WARMUP_RUNS=5` happens inside `b.iter`
+//! and is intended to prime the harness, not absorb cold cache misses from
+//! a freshly-relayouted binary. The pre-`b.iter` warmup pages in parse code,
+//! lazy-inits the allocator, and trains the branch predictor on the actual
+//! input *before* CodSpeed flips on Callgrind instrumentation, so a single
+//! cold I-fetch miss (~105 estimated cycles) no longer dominates the short
+//! `specifier/realistic[*]` deltas.
 
 #[cfg(target_family = "wasm")]
 use std::alloc::System;
@@ -186,11 +195,26 @@ fn specifier_realistic_cases() -> Vec<(&'static str, &'static str)> {
   ]
 }
 
+// Number of pre-`b.iter` parse calls used to warm i-cache, branch predictor
+// and allocator state for the input under measurement. Chosen so the parse
+// hot path (~few KB of instructions) runs many times over, and to dwarf
+// CodSpeed's internal `WARMUP_RUNS=5`, which alone is not enough to absorb
+// the single cold-fetch miss caused by binary-layout shifts.
+const WARM_PARSE_ITERS: usize = 32;
+
+#[inline(never)]
+fn warm_parse(s: &str) {
+  for _ in 0..WARM_PARSE_ITERS {
+    let _ = black_box(Specifier::parse(black_box(s)));
+  }
+}
+
 fn bench_specifier_branches(c: &mut Criterion) {
   let mut group = c.benchmark_group("specifier/branches");
   for (label, input) in specifier_branch_cases() {
     group.throughput(Throughput::Bytes(input.len() as u64));
     group.bench_with_input(BenchmarkId::from_parameter(label), &input, |b, s| {
+      warm_parse(s.as_str());
       b.iter(|| {
         let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
         black_box(parsed);
@@ -208,6 +232,7 @@ fn bench_specifier_length_sweep(c: &mut Criterion) {
       let id = BenchmarkId::new(shape_label, len_label);
       group.throughput(Throughput::Bytes(input.len() as u64));
       group.bench_with_input(id, &input, |b, s| {
+        warm_parse(s.as_str());
         b.iter(|| {
           let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
           black_box(parsed);
@@ -239,6 +264,7 @@ fn bench_specifier_escape_scaling(c: &mut Criterion) {
       BenchmarkId::from_parameter(format!("escapes_{n}")),
       &input,
       |b, s| {
+        warm_parse(s.as_str());
         b.iter(|| {
           let parsed = Specifier::parse(black_box(s.as_str())).unwrap();
           black_box(parsed);
@@ -254,6 +280,7 @@ fn bench_specifier_realistic(c: &mut Criterion) {
   for (label, input) in specifier_realistic_cases() {
     group.throughput(Throughput::Bytes(input.len() as u64));
     group.bench_with_input(BenchmarkId::from_parameter(label), input, |b, s| {
+      warm_parse(s);
       b.iter(|| {
         let parsed = Specifier::parse(black_box(s)).unwrap();
         black_box(parsed);