From 2bf71158cde397de4b423ce100385a0e5561e900 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 21:56:24 +0800 Subject: [PATCH 01/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=20256=20+=20add=20fetch=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit p1_resolve has been ~0.9s behind bun on phases bench for the past several PRs. Pcap on prior runs measured bun opening ~260 parallel TCP streams against registry.npmjs.org for resolve, while utoo opened ~70 (the 64 manifests-concurrency-limit cap was at saturation). Adding fetch-breakdown timing in ruborist showed where p1's 22s (local Mac) actually goes: fetch-timings: n=2730 sum_request = 1089s (88% — TCP+TLS+HTTP RTT to first byte) sum_body = 138s (11% — body download) sum_parse = 2s (0.16% — simd_json on rayon) The dominant cost is per-request RTT, not parsing or body transfer. The lever is the cap on concurrent in-flight requests. This commit: 1. Adds `crates/ruborist/src/util/timing.rs` — process-wide atomic accumulator that records per-fetch (request_us, body_us, parse_us, bytes) inside both `fetch_full_manifest` and `fetch_version_manifest`. Reset before each preload phase, dumped at INFO level after preload + bfs. 2. Bumps `manifests-concurrency-limit` default 64 → 256 to match bun's observed working point against npmjs.org. CI bench will validate. Expected: p1 utoo wall drops toward bun's range (~2.3s on GHA). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 10 +- crates/ruborist/src/resolver/builder.rs | 17 ++- crates/ruborist/src/service/manifest.rs | 24 ++++- crates/ruborist/src/util/mod.rs | 2 + crates/ruborist/src/util/timing.rs | 134 ++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 6 deletions(-) create mode 100644 crates/ruborist/src/util/timing.rs diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 34ee45a34..bc281fb40 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,9 +132,15 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration +// Manifest fetch concurrency configuration. +// +// 256 to match bun's observed ~260 parallel TCP streams against +// registry.npmjs.org. Local fetch-breakdown instrumentation showed +// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), +// only 11% body, 0.16% parse — so the dominant lever for p1 wall is +// the cap on concurrent in-flight manifest requests. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index b0bf2794c..166372c91 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -756,6 +756,7 @@ async fn run_preload_phase( return; } + crate::util::FETCH_TIMINGS.reset(); let start = tokio::time::Instant::now(); let initial_deps = gather_preload_deps(graph, config.peer_deps); @@ -794,7 +795,13 @@ async fn run_preload_phase( failed: stats.failed_count, }); - tracing::debug!("Preload phase: {:?}", start.elapsed()); + let preload_elapsed = start.elapsed(); + tracing::debug!("Preload phase: {:?}", preload_elapsed); + tracing::info!( + "p1-breakdown preload_wall={}ms | {}", + preload_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); } /// Run the BFS traversal phase to build the dependency tree. 
@@ -896,7 +903,13 @@ async fn run_bfs_phase( current_level = next_level; } - tracing::debug!("Build phase: {:?}", start.elapsed()); + let bfs_elapsed = start.elapsed(); + tracing::debug!("Build phase: {:?}", bfs_elapsed); + tracing::info!( + "p1-breakdown bfs_wall={}ms | {}", + bfs_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); Ok(()) } diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 74baf3b9c..36bc6a85a 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -12,6 +12,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on rayon's CPU thread pool (native) or inline /// (wasm32). Keeps the tokio runtime free of `simd_json` work so other @@ -91,7 +92,9 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result) -> Result(bytes) + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = bytes.len() as u64; + let t_parse_start = std::time::Instant::now(); + let parsed = parse_json_off_runtime::(bytes) .await - .map_err(FetchError::Permanent) + .map_err(FetchError::Permanent); + let parse_us = t_parse_start.elapsed().as_micros() as u64; + if parsed.is_ok() { + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + } + parsed } else { Err(classify_status(response.status(), &url)) } diff --git a/crates/ruborist/src/util/mod.rs b/crates/ruborist/src/util/mod.rs index 649e47c95..a7f0b7b7d 100644 --- a/crates/ruborist/src/util/mod.rs +++ b/crates/ruborist/src/util/mod.rs @@ -1,6 +1,8 @@ //! Shared utility primitives for ruborist and downstream consumers. 
pub mod oncemap; +pub mod timing; pub use crate::model::util::{PackageNameStr, parse_package_spec, read_package_json}; pub use oncemap::OnceMap; +pub use timing::{FETCH_TIMINGS, FetchTimings, FetchTimingsSnapshot}; diff --git a/crates/ruborist/src/util/timing.rs b/crates/ruborist/src/util/timing.rs new file mode 100644 index 000000000..f50e921b9 --- /dev/null +++ b/crates/ruborist/src/util/timing.rs @@ -0,0 +1,134 @@ +//! Per-phase manifest fetch timing accumulator for p1 perf investigation. +//! +//! Splits each `fetch_*_manifest` call into three observable pieces: +//! - `request_us`: from `request.send().await` to response headers +//! received. Captures TCP connect (when not pooled), TLS handshake, +//! HTTP request roundtrip, and server-side processing. +//! - `body_us`: from response headers to the entire JSON body buffered. +//! Network-bandwidth bound for large packuments. +//! - `parse_us`: from full body buffered to a typed manifest. CPU bound +//! (simd_json on a spawn_blocking thread). +//! +//! `parse_us` is wall-clock for the await on `parse_json_off_runtime` — +//! since JSON parse runs on `spawn_blocking`, this includes scheduling +//! latency rather than pure CPU time. Together with the per-fetch total +//! already tracked in `preload_manifests`, this lets us answer "where +//! did p1's wall time go?" without external profiling. +//! +//! All counters are `AtomicU64` so the recording path is lock-free. +//! Numbers are reset between resolves via [`reset()`] so successive +//! `utoo deps` invocations report independently. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Per-process accumulator for manifest fetch timings. +#[derive(Default, Debug)] +pub struct FetchTimings { + /// Number of fetches recorded (full + version manifest). + pub count: AtomicU64, + /// Sum of microseconds spent in `request.send().await`. + pub request_us: AtomicU64, + /// Sum of microseconds spent in `response.bytes().await`. 
+ pub body_us: AtomicU64, + /// Sum of microseconds spent awaiting `parse_json_off_runtime`. + pub parse_us: AtomicU64, + /// Sum of body bytes received across all fetches. + pub bytes: AtomicU64, +} + +impl FetchTimings { + /// Record one fetch's split timings. Call once per successful fetch. + pub fn record(&self, request_us: u64, body_us: u64, parse_us: u64, bytes: u64) { + self.count.fetch_add(1, Ordering::Relaxed); + self.request_us.fetch_add(request_us, Ordering::Relaxed); + self.body_us.fetch_add(body_us, Ordering::Relaxed); + self.parse_us.fetch_add(parse_us, Ordering::Relaxed); + self.bytes.fetch_add(bytes, Ordering::Relaxed); + } + + /// Reset all counters to zero. + pub fn reset(&self) { + self.count.store(0, Ordering::Relaxed); + self.request_us.store(0, Ordering::Relaxed); + self.body_us.store(0, Ordering::Relaxed); + self.parse_us.store(0, Ordering::Relaxed); + self.bytes.store(0, Ordering::Relaxed); + } + + /// Snapshot of the current accumulator state. + pub fn snapshot(&self) -> FetchTimingsSnapshot { + FetchTimingsSnapshot { + count: self.count.load(Ordering::Relaxed), + request_us: self.request_us.load(Ordering::Relaxed), + body_us: self.body_us.load(Ordering::Relaxed), + parse_us: self.parse_us.load(Ordering::Relaxed), + bytes: self.bytes.load(Ordering::Relaxed), + } + } +} + +/// Immutable snapshot suitable for printing. +#[derive(Debug, Clone, Copy)] +pub struct FetchTimingsSnapshot { + pub count: u64, + pub request_us: u64, + pub body_us: u64, + pub parse_us: u64, + pub bytes: u64, +} + +impl FetchTimingsSnapshot { + /// One-line summary for tracing logs. 
+ pub fn summary_line(&self) -> String { + if self.count == 0 { + return "fetch-timings: no requests recorded".to_string(); + } + let count = self.count; + let avg_req = self.request_us / count; + let avg_body = self.body_us / count; + let avg_parse = self.parse_us / count; + let avg_bytes = self.bytes / count; + format!( + "fetch-timings: n={} sum_request={}ms sum_body={}ms sum_parse={}ms total_bytes={}MB | avg_request={}us avg_body={}us avg_parse={}us avg_bytes={}KB", + count, + self.request_us / 1_000, + self.body_us / 1_000, + self.parse_us / 1_000, + self.bytes / 1_000_000, + avg_req, + avg_body, + avg_parse, + avg_bytes / 1_024, + ) + } +} + +/// Process-wide manifest fetch timing accumulator. +pub static FETCH_TIMINGS: FetchTimings = FetchTimings { + count: AtomicU64::new(0), + request_us: AtomicU64::new(0), + body_us: AtomicU64::new(0), + parse_us: AtomicU64::new(0), + bytes: AtomicU64::new(0), +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn record_and_snapshot() { + FETCH_TIMINGS.reset(); + FETCH_TIMINGS.record(100, 200, 300, 1024); + FETCH_TIMINGS.record(150, 250, 350, 2048); + let snap = FETCH_TIMINGS.snapshot(); + assert_eq!(snap.count, 2); + assert_eq!(snap.request_us, 250); + assert_eq!(snap.body_us, 450); + assert_eq!(snap.parse_us, 650); + assert_eq!(snap.bytes, 3072); + FETCH_TIMINGS.reset(); + let snap2 = FETCH_TIMINGS.snapshot(); + assert_eq!(snap2.count, 0); + } +} From 8ac97ae036ab97cb986ce19109af18e130dbc1cd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:25:36 +0800 Subject: [PATCH 02/24] =?UTF-8?q?chore(p1):=20revert=20concurrency=20256?= =?UTF-8?q?=20=E2=86=92=2064=20+=20restore=20manifest-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes after the GHA bench on the previous commit (PR #2916, run 25559625024) showed the concurrency=256 hypothesis was wrong on GHA's environment. 
Revert concurrency 256 → 64 --------------------------- The new fetch-timing instrumentation shipped in the previous commit caught the surprise: GHA's pcap-vs-local profile is the *opposite* of what local Mac measurements suggested. metric local Mac GHA Linux avg_request 399ms 70ms ← network MUCH faster on GHA avg_body 50ms 20ms avg_parse 730µs 266ms ← parse 365× SLOWER on GHA Mechanism: `parse_json_off_runtime` dispatches to `rayon::spawn`, and rayon's pool size is `num_cpus` (= 2 on GHA ubuntu-latest). Bumping concurrency 64 → 256 queued 256 manifest parses behind 2 rayon workers — head-of-line blocking. avg_parse jumped from ~10ms to 266ms wall, dragging p1 utoo wall from 3.10s up to 3.33s. Restore manifest-bench ---------------------- Brought back `crates/manifest-bench` (originally landed in the post-#2818 driver hunt, dropped in af714eb3 once #2818 graduated). It's a single-binary HTTP-only fetch tool that strips out the ruborist pipeline (no BFS, no dedup, no parse, no project cache, no lockfile write) — fires `GET /` in parallel and reports the same diag shape as the new `p1-breakdown` lines. Goal: separate the network ceiling from the resolver pipeline so the next round of p1 experiments (parse offload, partial parse, dedicated parse pool, etc.) can be evaluated against a stable "pure network" baseline. Knobs (unchanged from the original drop): --concurrency N sweep without rebuilding utoo --reps N run same workload back-to-back --single-version use //latest (smaller bodies) --user-agent X UA-fingerprint experiments --http1-only H2 vs H1 toggle --accept X override Accept header Same TLS stack as ruborist (rustls + aws-lc-rs, native roots). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 1 + crates/manifest-bench/Cargo.toml | 37 +++ crates/manifest-bench/src/main.rs | 371 ++++++++++++++++++++++++++++++ crates/pm/src/util/user_config.rs | 19 +- 4 files changed, 421 insertions(+), 7 deletions(-) create mode 100644 crates/manifest-bench/Cargo.toml create mode 100644 crates/manifest-bench/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index ef4a4f926..0574a185a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] resolver = "2" members = [ + "crates/manifest-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/crates/manifest-bench/Cargo.toml b/crates/manifest-bench/Cargo.toml new file mode 100644 index 000000000..5b01e57c0 --- /dev/null +++ b/crates/manifest-bench/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "manifest-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Standalone HTTP-only manifest fetch benchmark, isolating network behaviour from ruborist's resolver pipeline." + +[[bin]] +name = "manifest-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Identical TLS / DNS choices to ruborist so we measure the *protocol* +# characteristics of the same stack, not a different implementation. 
+reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/manifest-bench/src/main.rs b/crates/manifest-bench/src/main.rs new file mode 100644 index 000000000..fa70f3fe4 --- /dev/null +++ b/crates/manifest-bench/src/main.rs @@ -0,0 +1,371 @@ +//! Standalone HTTP-only manifest fetch benchmark. +//! +//! Isolates the network behaviour of `reqwest + rustls + tokio` from +//! ruborist's resolver pipeline (BFS, dedup, parse, lockfile, project +//! cache). Reads a list of package names, builds manifest URLs, fires +//! parallel `GET` requests, records `(start, end)` per request, and +//! reports the same diag shape as ruborist's `Preload HTTP diag` line. +//! +//! Two input modes: +//! - `--names-file ` — newline-separated package names +//! - `--lockfile ` — a npm-style package-lock.json; we extract +//! the `packages.*` (v3) or `dependencies.*` (v2) keys +//! +//! Two registry modes: +//! - `/` — full manifest endpoint (default, npmjs) +//! - `//latest` — single-version endpoint +//! (gated behind `--single-version`) +//! +//! Each request reads the body to completion (we only measure I/O, no +//! parse). Output: same fields as preload's HTTP diag for direct +//! comparison. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; + +#[derive(Parser, Debug)] +#[command( + name = "manifest-bench", + about = "HTTP-only manifest fetch bench (no parse, no resolver)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// File of newline-separated package names. Mutually exclusive with `--lockfile`. 
+ #[arg(long, conflicts_with = "lockfile")] + names_file: Option, + + /// `package-lock.json` file. Reads top-level `packages.*.name` keys. + #[arg(long)] + lockfile: Option, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 128)] + concurrency: usize, + + /// Number of times to repeat the whole sweep (each iteration is a + /// fresh `reqwest::Client`, so connection pool / TLS handshake + /// costs are paid each time, matching `hyperfine` cold-start). + #[arg(long, default_value_t = 1)] + reps: usize, + + /// Use the single-version endpoint `//latest` instead of the + /// full-manifest endpoint `/`. Smaller bodies, more requests + /// served per byte. + #[arg(long)] + single_version: bool, + + /// Override `Accept` header. Default mimics ruborist's preload + /// (`application/vnd.npm.install-v1+json` — abbreviated metadata). + #[arg(long)] + accept: Option, + + /// Override `User-Agent`. Default uses reqwest's default. Try + /// `Bun/1.x.x` to test whether Cloudflare differentiates by UA. + #[arg(long)] + user_agent: Option, + + /// Force HTTP/1.1 (no H2 negotiation). Default lets ALPN decide. 
+ #[arg(long)] + http1_only: bool, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let names = load_names(&args)?; + if names.is_empty() { + return Err(anyhow!("no package names found in input")); + } + + println!( + "manifest-bench: registry={} concurrency={} reps={} names={} h1_only={} single_version={} accept={} ua={}", + args.registry, + args.concurrency, + args.reps, + names.len(), + args.http1_only, + args.single_version, + args.accept.as_deref().unwrap_or(""), + args.user_agent.as_deref().unwrap_or(""), + ); + + for rep in 1..=args.reps { + run_once(&args, &names, rep).await?; + } + + Ok(()) +} + +fn load_names(args: &Args) -> Result> { + if let Some(path) = &args.names_file { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return Ok(raw + .lines() + .map(str::trim) + .filter(|s| !s.is_empty() && !s.starts_with('#')) + .map(str::to_string) + .collect()); + } + + if let Some(path) = &args.lockfile { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return extract_lockfile_names(&raw); + } + + Err(anyhow!("provide --names-file or --lockfile")) +} + +/// Pull unique package names from an npm v3 lockfile (`packages.*`) +/// or an older v2 lockfile (`dependencies.*`). +fn extract_lockfile_names(raw: &str) -> Result> { + use std::collections::BTreeSet; + + let v: serde_json::Value = serde_json::from_str(raw).context("parse lockfile JSON")?; + let mut names: BTreeSet = BTreeSet::new(); + + if let Some(packages) = v.get("packages").and_then(|p| p.as_object()) { + for key in packages.keys() { + if key.is_empty() { + continue; + } + // npm v3 packages key like "node_modules/foo" or + // "node_modules/@scope/bar/node_modules/baz" — take the + // last path segment (or @scope/name pair). 
+ let last = last_module_name(key); + if !last.is_empty() { + names.insert(last); + } + } + } else if let Some(deps) = v.get("dependencies").and_then(|d| d.as_object()) { + for key in deps.keys() { + names.insert(key.clone()); + } + } + + Ok(names.into_iter().collect()) +} + +fn last_module_name(key: &str) -> String { + let parts: Vec<&str> = key.split("node_modules/").collect(); + let tail = parts.last().copied().unwrap_or(""); + tail.to_string() +} + +#[derive(Debug)] +struct ReqResult { + start: Instant, + end: Instant, + bytes: usize, + status: u16, +} + +async fn run_once(args: &Args, names: &[String], rep: usize) -> Result<()> { + // Build a fresh client per rep — matches hyperfine's cold-start + // assumption that each iteration pays the TLS handshake cost. + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let accept = Arc::new( + args.accept + .clone() + .unwrap_or_else(|| "application/vnd.npm.install-v1+json".to_string()), + ); + + let single_version = args.single_version; + let concurrency = args.concurrency; + + let phase_start = Instant::now(); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; + let mut results: Vec = Vec::with_capacity(names.len()); + + while idx < names.len() && futs.len() < concurrency { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + + while let Some(res) = futs.next().await { + results.push(res); + if idx < names.len() { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + } + let phase_wall_ms = phase_start.elapsed().as_millis(); + + report(rep, &results, phase_wall_ms); + Ok(()) +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_one( + client: &reqwest::Client, + registry: &Arc, + name: &str, + accept: &Arc, + single_version: bool, + futs: &mut FuturesUnordered, +) { + let url = if single_version { + 
format!("{registry}/{name}/latest") + } else { + format!("{registry}/{name}") + }; + let client = client.clone(); + let accept = Arc::clone(accept); + futs.push(Box::pin(async move { + let start = Instant::now(); + let req = client.get(&url).header("accept", accept.as_str()).send(); + let (bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.len()).unwrap_or(0); + (body, status) + } + Err(_) => (0, 0), + }; + let end = Instant::now(); + ReqResult { + start, + end, + bytes, + status, + } + })); +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Matches ruborist's `service::http` setup. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? 
+ .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} + +fn report(rep: usize, results: &[ReqResult], wall_ms: u128) { + if results.is_empty() { + eprintln!("[rep {rep}] no results"); + return; + } + + let mut spans: Vec<(Instant, Instant)> = results.iter().map(|r| (r.start, r.end)).collect(); + spans.sort_by_key(|(s, _)| *s); + + let first_start = spans.first().unwrap().0; + let last_end = spans.iter().map(|(_, e)| *e).max().unwrap(); + let win_wall = last_end.duration_since(first_start).as_millis(); + + let mut per_us: Vec = spans + .iter() + .map(|(s, e)| e.duration_since(*s).as_micros()) + .collect(); + per_us.sort_unstable(); + let n = per_us.len(); + let pct = |p: usize| per_us[(n * p).div_ceil(100).saturating_sub(1)]; + let sum: u128 = per_us.iter().sum(); + let p50 = per_us[n / 2]; + + let mut busy_us: u128 = 0; + let (mut cur_s, mut cur_e) = spans[0]; + for &(s, e) in &spans[1..] 
{ + if s <= cur_e { + if e > cur_e { + cur_e = e; + } + } else { + busy_us += cur_e.duration_since(cur_s).as_micros(); + cur_s = s; + cur_e = e; + } + } + busy_us += cur_e.duration_since(cur_s).as_micros(); + + let bytes_total: usize = results.iter().map(|r| r.bytes).sum(); + let ok = results.iter().filter(|r| r.status == 200).count(); + let err = results.iter().filter(|r| r.status == 0).count(); + let four_xx = results + .iter() + .filter(|r| (400..500).contains(&r.status)) + .count(); + let five_xx = results + .iter() + .filter(|r| (500..600).contains(&r.status)) + .count(); + + let avg_conc = if busy_us > 0 { + sum as f64 / busy_us as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] n={} phase_wall={}ms win_wall={}ms busy={}ms ({:.0}%) sum={}ms avg_conc={:.1} p50={}ms p95={}ms p99={}ms max={}ms bytes={} 200={} 4xx={} 5xx={} err={}", + n, + wall_ms, + win_wall, + busy_us / 1000, + if win_wall > 0 { + 100.0 * (busy_us as f64 / 1000.0) / win_wall as f64 + } else { + 0.0 + }, + sum / 1000, + avg_conc, + p50 / 1000, + pct(95) / 1000, + pct(99) / 1000, + per_us.last().unwrap() / 1000, + bytes_total, + ok, + four_xx, + five_xx, + err, + ); +} diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index bc281fb40..a0235830a 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,15 +132,20 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration. +// Manifest fetch concurrency configuration. Default kept at 64. // -// 256 to match bun's observed ~260 parallel TCP streams against -// registry.npmjs.org. Local fetch-breakdown instrumentation showed -// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), -// only 11% body, 0.16% parse — so the dominant lever for p1 wall is -// the cap on concurrent in-flight manifest requests. 
+// We tried 256 to match bun's observed parallel streams; on GHA the +// fetch-breakdown instrumentation showed sum_parse exploded from +// ~10ms (local Mac, network-bound) to 728s on first cold run with +// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to +// rayon, which has only num_cpus (=2 on GHA) workers. Bumping +// concurrency to 256 queued 256 parses behind 2 workers → wall +// per-parse jumped from 730µs to 266ms. Net p1 wall *increased* +// 3.10s → 3.33s on phases bench. Keep 64 until we address the +// parse-side queueing (e.g. inline parse on tokio, or a wider +// dedicated parse pool). static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 5690a9b6b416fb7040a52a3ce24a303177d8bc76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:56:20 +0800 Subject: [PATCH 03/24] ci(p1): wire manifest-bench standalone HTTP sweep into bench-phases-linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build-linux now also builds + uploads `manifest-bench` when a phases bench is going to run (label or dispatch). bench-phases-linux downloads the binary and runs it after the regular phase-isolated benchmark. Sweep mirrors the original (#2818-era) wire-in: concurrency: 32 / 64 / 96 / 128 / 192 / 256 (HTTP/1.1, full manifest) protocol: H1 vs H2-negotiate (cap=128) endpoint: full vs `//latest` (cap=128, smaller bodies) UA: default vs `Bun/1.2.21` (cap=128) Output goes to /tmp/pm-bench-output/manifest-bench-npmjs.log and ships in the existing pm-bench-logs-linux artifact — no PR comment surface (the headline phases bench comment stays the same). 
Why now: the new ruborist `p1-breakdown` instrumentation showed sum_parse on GHA can dominate when concurrency is bumped (256: sum_parse 728s vs sum_request 193s). To attribute the bun-vs-utoo gap on p1_resolve we need a "pure HTTP" baseline that strips out ruborist's parse / BFS / dedup / lockfile path. manifest-bench is that baseline: same TLS stack as ruborist (rustls + aws-lc-rs, native roots), no resolver pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 74c90ece5..b25f5c380 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -143,6 +143,24 @@ jobs: name: utoo-linux-x64 path: target/x86_64-unknown-linux-gnu/release/utoo retention-days: 1 + # manifest-bench is a standalone HTTP-only fetch sweeper used as + # the network-only baseline for p1_resolve perf work. Built only + # when phases bench is going to run (label or dispatch), so plain + # PR builds aren't slowed by the extra crate. 
+ - name: Build manifest-bench (p1 baseline) + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p manifest-bench + - name: Upload manifest-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/manifest-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -516,6 +534,19 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # Download the manifest-bench binary built by build-linux. Used as + # the network-only baseline for p1_resolve work — strips out parse, + # BFS, dedup, lockfile write so the wall is pure HTTP fetch. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -565,6 +596,55 @@ jobs: run: | mkdir -p /tmp/pm-bench-output bash bench/pm-bench-phases.sh 2>&1 | tee /tmp/pm-bench-output/bench-phases-npmmirror.log + # Standalone HTTP-only sweep — sweeps the network-only ceiling + # against the same lockfile-derived workload phase-bench just used. + # Output goes into the bench logs artifact; no PR comment surface. + - name: Standalone manifest-bench (HTTP-only sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + mkdir -p /tmp/pm-bench + git clone --depth 1 "https://github.com/ant-design/$PROJECT" "$PROJECT_DIR" + fi + cd "$PROJECT_DIR" + if [ ! -f package-lock.json ]; then + echo "==> generating lockfile via utoo (one-shot, untimed)" + utoo deps --registry "$REGISTRY" || true + fi + ls -la package-lock.json || { echo "no lockfile; skipping manifest-bench"; exit 0; } + + MB_LOG=/tmp/pm-bench-output/manifest-bench-npmjs.log + { + echo "============================================================" + echo "manifest-bench: HTTP-only fetch (no parse, no resolver)" + echo " Goal: isolate reqwest/rustls/tokio behaviour from" + echo " ruborist's resolver pipeline. Same metric shape as" + echo " ruborist's p1-breakdown line." 
+ echo "============================================================" + for CAP in 32 64 96 128 192 256; do + echo + echo "--- concurrency=$CAP, h1, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 2 --http1-only || true + done + echo + echo "--- concurrency=128, h2 negotiate, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 || true + echo + echo "--- concurrency=128, h1, single-version endpoint ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --single-version || true + echo + echo "--- concurrency=128, h1, UA=Bun/1.2.21 ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true + } 2>&1 | tee "$MB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 From 94af458887de3add09f2e973dbbad6f2524f1a5f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:24:56 +0800 Subject: [PATCH 04/24] perf(ruborist): inline JSON parse, drop rayon::spawn dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI fetch-breakdown on GHA (run 25562552058, conc=64) showed parse queueing on rayon dominates the gap to manifest-bench's pure-HTTP baseline: manifest-bench (pure HTTP, conc=64): 2.12s wall utoo p1 (full ruborist): 3.10s wall ← +1.0s overhead ↑ sum_parse 95s vs sum_request 95s, parse 50% of work-time ↑ avg_parse 30ms wall vs ~5ms actual CPU — the 25ms extra is rayon queue wait Mechanism: 64 concurrent tasks all dispatching parse to rayon's pool (size = num_cpus = 2 on GHA). Queue depth grows to ~32 per worker. Each parse waits 25ms+ in queue before running its 5ms of CPU work. Round 1 fix: inline parse, drop the rayon hop. 
simd_json on a tokio worker thread is fast (~5ms for 115KB JSON), and the tokio runtime's cooperative budget naturally rebalances CPU across the 64 tasks. Expected on next CI: - avg_parse drops from 30ms wall → ~5-10ms wall (close to CPU-only) - preload_wall drops from 5.4s → ~3.5-4s for cold runs - p1 hyperfine wall drops from 3.10s → 2.3-2.5s, narrowing the gap to manifest-bench's 2.12s ceiling If parse becomes the new bottleneck (CPU-bound), next round could look at partial parse / lazy field access. If wall doesn't drop, hypothesis is wrong and we look elsewhere (BFS, dedup, lockfile). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 29 +++++++++---------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 36bc6a85a..3502f6ec2 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,29 +14,20 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes on rayon's CPU thread pool (native) or inline -/// (wasm32). Keeps the tokio runtime free of `simd_json` work so other -/// in-flight manifest fetches keep driving network IO while this one -/// parses. +/// Parse JSON bytes inline on the calling tokio task. Previously this +/// dispatched to `rayon::spawn` to "free the runtime", but +/// fetch-breakdown instrumentation on GHA showed the rayon hop made it +/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), +/// 64 concurrent fetches all dispatching parse queued behind 2 workers +/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + +/// CPU). Inlining puts parse on the tokio worker that already owns +/// the buffer; the cooperative-scheduling budget naturally rebalances +/// CPU between fetches. 
async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - #[cfg(not(target_arch = "wasm32"))] - { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { - let result = simd_json::serde::from_slice::(&mut bytes) - .map_err(|e| anyhow!("JSON parse error: {e}")); - let _ = tx.send(result); - }); - rx.await - .map_err(|e| anyhow!("rayon parse channel closed: {e}"))? - } - #[cfg(target_arch = "wasm32")] - { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) - } + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) } /// Result of a full manifest fetch with ETag support. From ee5f5f4d23c8c9668c90c7d6b3b12eb49dab3afe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:47:49 +0800 Subject: [PATCH 05/24] perf(ruborist): switch JSON parse to tokio spawn_blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 1 (inline parse) reverted on data: GHA showed +0.37s p1 regression because parse blocked tokio runtime workers, dropping eff_parallel 42 → 35 even though per-fetch work-time fell. avg_request went up from 35ms → 52ms — symptomatic of socket reads being delayed by the parsing task on the same worker. metric round 0 (rayon) round 1 (inline) p1 wall 3.27s 3.64s ⚠️ +0.37s avg_parse 30ms (queued) 300µs ✓ avg_request 35ms 52ms ⚠️ +17ms (worker contention) eff_parallel 42 35 ⚠️ Round 2 attempts the third option: `tokio::task::spawn_blocking`. - rayon's pool was too small (num_cpus = 2 on GHA) — 64 concurrent parses queued behind 2 workers, parse wall 30ms. - inline parse held tokio worker hostage during simd_json call, starving in-flight socket reads. - tokio's blocking pool has a much larger default cap (512), so 64 concurrent parses never queue. Unlike rayon there's no contention with the install path's parallel-write rayon usage. 
Unlike inline the tokio runtime workers stay free to drive network I/O. Expected on next CI: - avg_parse drops to ~5-10ms wall (close to CPU floor, no queue) - avg_request stays ~35ms (workers free for I/O) - eff_parallel returns to ~50, possibly higher - p1 wall drops toward manifest-bench's 2.10s ceiling Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 3502f6ec2..90f1db71b 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,20 +14,39 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes inline on the calling tokio task. Previously this -/// dispatched to `rayon::spawn` to "free the runtime", but -/// fetch-breakdown instrumentation on GHA showed the rayon hop made it -/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), -/// 64 concurrent fetches all dispatching parse queued behind 2 workers -/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + -/// CPU). Inlining puts parse on the tokio worker that already owns -/// the buffer; the cooperative-scheduling budget naturally rebalances -/// CPU between fetches. +/// Parse JSON bytes on tokio's blocking thread pool. +/// +/// The history of this function captures three different attempts: +/// - rayon::spawn (original): rayon's pool is `num_cpus` (= 2 on +/// GHA), 64 concurrent parses queued behind 2 workers → avg_parse +/// 30ms wall vs ~5ms CPU. round-0 baseline. 
+/// - inline (round 1, reverted): no rayon hop, but the simd_json +/// call blocks the tokio runtime worker, so other in-flight +/// fetches couldn't drive their socket I/O — avg_request grew +/// 35ms → 52ms (+17ms), eff_parallel 42 → 35, net p1 wall +0.37s. +/// - spawn_blocking (current): tokio's dedicated blocking pool has +/// a much higher default cap (512), so 64 concurrent parses are +/// never queued. Unlike rayon there's no contention with the +/// install path's parallel-write rayon usage, and unlike inline +/// the tokio runtime workers stay free to drive network I/O on +/// all in-flight fetches. async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + simd_json::serde::from_slice::(&mut bytes) + .map_err(|e| anyhow!("JSON parse error: {e}")) + }) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? + } + #[cfg(target_arch = "wasm32")] + { + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + } } /// Result of a full manifest fetch with ETag support. From 16404fc481577a03b00ba2f46aa1f3711ec5351f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:14:46 +0800 Subject: [PATCH 06/24] perf(ruborist): switch extract_core_version to spawn_blocking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 moved parse_json_off_runtime off rayon (-0.11s p1). But fetch-breakdown still showed avg_request 41ms vs round 0's 35ms, hinting at a second source of rayon contention. Found it: `extract_core_version_off_runtime` is also on `rayon::spawn`. 
On npmjs.org's `!supports_semver` path EVERY fetch resolves through `resolve_via_full_manifest`, which fetches the full packument once per package name (deduped via inflight_full) and then calls `extract_core_version_off_runtime` per (name, spec) to materialize the chosen version into a `CoreVersionManifest`. So per fetch we hit rayon TWICE — once for the JSON parse (round 2 moved to spawn_blocking), and once for `get_core_version` (still on rayon). The second hop has the same head-of-line blocking signature as the first: 64 concurrent resolves dispatching to a 2-thread rayon pool. Round 3: move extract_core_version_off_runtime to spawn_blocking for the same reasons. The work is JSON lazy-reparse (`raw_json` sub-tree decoding) — genuinely blocking, well-suited for tokio's blocking pool. Expected: utoo p1 wall drops further toward manifest-bench's 2.10s ceiling. avg_request should fall back from 41ms → ~35ms (rayon contention removed from the fetch task's await chain). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 37e95deb9..15c762eb5 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,14 +163,20 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { + // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking + // history: rayon's `num_cpus` pool oversubscribes when many concurrent + // resolves all extract from full manifests at once. spawn_blocking's + // larger pool avoids the queue, and the work is genuinely blocking + // (lazy JSON re-parse via `get_core_version`) so the blocking pool + // is the right home. 
#[cfg(not(target_arch = "wasm32"))] { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { + tokio::task::spawn_blocking(move || { let core = full.get_core_version(&version).map(Arc::new); - let _ = tx.send((version, core)); - }); - rx.await.expect("rayon parse worker dropped before sending") + (version, core) + }) + .await + .expect("spawn_blocking parse worker panicked") } #[cfg(target_arch = "wasm32")] { From 460a53885b30982bd19c68ca1a866fa540c66a76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:37:37 +0800 Subject: [PATCH 07/24] revert + instrument(ruborist): post-build phase timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes for round 4 of p1 optimization: 1. Revert `extract_core_version_off_runtime` from spawn_blocking back to rayon::spawn (round 3). Within-run measurement showed +0.42s regression vs utoo-next (round 2 was +0.11s). Likely cause: this function is called per (name, spec), so multi-spec packages call it 2-5x per fetch. spawn_blocking's per-dispatch overhead exceeds rayon queue savings at this multiplier. 2. Add `serialize_us` and `cache_export_us` to the p1-breakdown line so we can attribute the remaining gap. Currently: manifest-bench wall: 2.10s (pure HTTP ceiling) utoo p1 wall (round 2): 3.16s gap: 1.06s We have: preload_wall ≈ 2.7s (logged) bfs_wall ≈ 0.3s (logged) serialize_us ? cache_export_us ? ← suspected: full manifest deep-clone into ProjectCacheData for ~2730 entries Next round will have data to choose between attacking serialize, cache export, or the BFS loop body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 24 +++++++++++++----------- crates/ruborist/src/service/api.rs | 10 ++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 15c762eb5..3509e839d 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,20 +163,22 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { - // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking - // history: rayon's `num_cpus` pool oversubscribes when many concurrent - // resolves all extract from full manifests at once. spawn_blocking's - // larger pool avoids the queue, and the work is genuinely blocking - // (lazy JSON re-parse via `get_core_version`) so the blocking pool - // is the right home. + // Round 3 attempted to switch this to `tokio::task::spawn_blocking` + // for the same reasons as `parse_json_off_runtime`, but CI showed + // it regressed p1 by 0.5s on `preload_wall`. Mechanism: this + // function is called per (name, spec), so packages with multiple + // specs (e.g. peer-dep range overlaps) call it 2-5x per fetch. + // spawn_blocking's per-dispatch overhead (channel + thread wake) + // is significant for short CPU work; with the multiplier this + // outweighed rayon queue waits at conc=64. Keep on rayon::spawn. 
#[cfg(not(target_arch = "wasm32"))] { - tokio::task::spawn_blocking(move || { + let (tx, rx) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { let core = full.get_core_version(&version).map(Arc::new); - (version, core) - }) - .await - .expect("spawn_blocking parse worker panicked") + let _ = tx.send((version, core)); + }); + rx.await.expect("rayon parse worker dropped before sending") } #[cfg(target_arch = "wasm32")] { diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 878b357a1..82703ed97 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -258,9 +258,12 @@ where .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + let t_serialize_start = std::time::Instant::now(); let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; // Export project cache from memory cache for the host to persist. + let t_cache_export_start = std::time::Instant::now(); let mut project_cache = ProjectCacheData::default(); for (key, manifest) in registry.cache().export_version_manifests() { // `parse_package_spec` rather than `split_once('@')` so scoped names @@ -271,6 +274,13 @@ where pkg_cache.specs.insert(spec.to_string(), version.clone()); pkg_cache.manifests.insert(version, (*manifest).clone()); } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); Ok(BuildDepsOutput { lock: PackageLock::new(&pkg.name, &pkg.version, packages), From 58d49aafd2f886d1af364d91f85997e4dc01e37e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:02:11 +0800 Subject: [PATCH 08/24] instrument(ruborist): preload main loop dispatch + result split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 4 measured serialize_us = 
15ms and cache_export_us = 34ms — both tiny — confirming the 1s gap from manifest-bench (utoo p1 = 3.16s vs mb wall = 2.10s) is not in post-build code. Per-fetch math also pointed at main-loop bookkeeping: manifest-bench: eff_parallel = 52 (sum_work 111s / wall 2.14s) utoo preload : eff_parallel = 43 (sum_work 120s / wall 2.85s) Same conc=64 cap, but utoo loses 9 effective slots — most likely the main loop's serial bookkeeping (dedup hash insert, format! key, extract_transitive_deps, queue push, 3-4 receiver events) holds the flow between futures.next() returning and the next fetch dispatch. This commit splits the main loop into two timed segments: preload_loop_dispatch_us: time spent in the `while in_flight < concurrency` block — popping pending, dedup check, futures.push. preload_loop_result_us: time spent processing each completed future — extract_transitive_deps, pending.extend, on_manifest. If dispatch+result sum approaches preload_wall, the main loop is the bottleneck and we need to either (a) split processing onto a dedicated task, or (b) use unbounded futures with a downstream consumer. If they're small, the gap is elsewhere (per-task overhead in resolve_package's inflight gates). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/preload.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/ruborist/src/resolver/preload.rs b/crates/ruborist/src/resolver/preload.rs index 1230c5bf6..e9a777407 100644 --- a/crates/ruborist/src/resolver/preload.rs +++ b/crates/ruborist/src/resolver/preload.rs @@ -99,8 +99,17 @@ where let mut in_flight = 0usize; let mut started = false; + // Main-loop overhead instrumentation. Atomic accumulators so we + // can attribute the gap between manifest-bench's pure-HTTP wall + // and ruborist's preload wall: how much of the gap is bookkeeping + // (dedup hash, extract_transitive_deps, queue push, events) vs + // actual fetch wait? 
+ let mut total_dispatch_us: u64 = 0; + let mut total_result_us: u64 = 0; + loop { // Fill up to concurrency limit + let dispatch_start = tokio::time::Instant::now(); while in_flight < concurrency { let item = loop { let Some((name, spec)) = pending.pop_front() else { @@ -134,6 +143,7 @@ where }); in_flight += 1; } + total_dispatch_us += dispatch_start.elapsed().as_micros() as u64; if in_flight == 0 { break; @@ -142,6 +152,7 @@ where let Some((name, result, elapsed_ms)) = futures.next().await else { break; }; + let result_start = tokio::time::Instant::now(); in_flight -= 1; if stats.success_count == 0 && stats.failed_count == 0 { @@ -174,8 +185,15 @@ where tracing::debug!("Failed to preload {}: {}", name, e); } } + total_result_us += result_start.elapsed().as_micros() as u64; } + tracing::info!( + "p1-breakdown preload_loop_dispatch_us={} preload_loop_result_us={}", + total_dispatch_us, + total_result_us, + ); + stats.total_processed = processed.len(); receiver.on_event(BuildEvent::PreloadComplete { From 8114bf42af0e9d102bd9c2893acd764d9e0470be Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:25:54 +0800 Subject: [PATCH 09/24] perf(pm): grow rayon pool to max(num_cpus, 8) to drain p1 extract queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 main-loop instrumentation showed the preload main loop itself is fast (15-25ms total dispatch+result). The 0.8s gap from manifest-bench's 2.10s wall lives INSIDE the spawned fetch tasks. Per-fetch wall (warm runs): measured: avg_request 30ms + avg_body 6ms + avg_parse 2.5ms = ~38ms derived: preload_wall 2.4s × eff_parallel(43) / 2730 = 38ms delta: ~12ms unaccounted per task That 12ms is `extract_core_version_off_runtime` queueing on rayon's 2-thread pool. extract is called per (name, spec) — for ant-design that's ~3000+ calls. 
With pool=2 and 64 concurrent fetches each dispatching extract, the queue depth grows; each task waits its turn before extract returns. Bump rayon pool to `max(num_cpus, 8)` for non-Windows. Sizing the pool above the CPU count for short blocking JSON ops (parse + extract) replaces FIFO queueing with parallel dispatch. Real CPU contention is bounded by num_cpus (the kernel scheduler still gates), so the extra pool threads just hold ready-to-run dispatches in parallel rather than serialised in a queue. Why not just spawn_blocking (round 3 attempt): tokio's blocking pool defaults to 512 threads, but its per-dispatch overhead was higher than rayon's even when queueing — round 3 regressed by 0.5s. Expected: extract queue wait drops from ~12ms to ~1-2ms wall, p1 preload_wall narrows toward manifest-bench's 2.10s. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/sysconf.rs | 45 ++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/crates/pm/src/util/sysconf.rs b/crates/pm/src/util/sysconf.rs index af77a7745..645b7b451 100644 --- a/crates/pm/src/util/sysconf.rs +++ b/crates/pm/src/util/sysconf.rs @@ -6,13 +6,46 @@ pub fn init() { reset_sigpipe(); } - // Windows default thread stack is 1MB, insufficient for libdeflater + tar - // + rayon work-stealing. + init_rayon_pool(); +} + +/// Configure the global rayon pool size. +/// +/// Rayon defaults to `num_cpus` workers, which is 2 on GHA ubuntu-latest. +/// Two workers are enough for the install-path's `par_chunks(64)` extract +/// (mostly disk-bound), but the resolve-path's manifest parse + extract +/// pipeline runs *many* short CPU bursts (parse: ~5ms, get_core_version: +/// ~1-3ms) dispatched from up to 64 concurrent fetches. +/// +/// With pool=2, each fetch waits up to ~25ms in queue per dispatch — +/// fetch-breakdown instrumentation showed avg_parse jumping 5ms (CPU) +/// → 30ms (CPU + queue) just from the first dispatch. 
The second hop +/// (`extract_core_version_off_runtime`) has the same problem. `tokio +/// spawn_blocking` avoids the queue but its per-dispatch overhead +/// (round 3 measurement) was higher than rayon's queue wait at 64×. +/// +/// Sizing the pool above the host CPU count for these short, blocking +/// JSON-shape operations gives the queue a chance to drain even when +/// 64 fetches dispatch concurrently. The work itself is bounded — at +/// most 2 are doing real CPU at once on a 2-core box; the extra pool +/// slots just hold pending tasks until a CPU is free, replacing FIFO +/// queueing with parallel dispatch. +/// +/// Cap of 8 keeps the pool reasonable on bigger machines (where +/// `num_cpus` is already enough); the floor of 8 oversubscribes +/// only on the constrained 2-core CI image. +fn init_rayon_pool() { + let parallelism = std::thread::available_parallelism() + .map(std::num::NonZero::get) + .unwrap_or(2); + let threads = parallelism.max(8); + + let builder = rayon::ThreadPoolBuilder::new().num_threads(threads); + #[cfg(target_os = "windows")] - rayon::ThreadPoolBuilder::new() - .stack_size(8 * 1024 * 1024) - .build_global() - .ok(); + let builder = builder.stack_size(8 * 1024 * 1024); + + builder.build_global().ok(); } /// Restore default SIGPIPE handling so broken pipes cause a clean exit From 394f6c92d7c5f929c18846abec54fefb9dbbb1bd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:06:09 +0800 Subject: [PATCH 10/24] perf(pm): skip preload for p1 path; BFS does per-level parallel prefetch Adds `BuildDepsOptions::skip_preload` so callers without a pipeline consumer (utoo deps / package-lock-only) can drop the up-front preload phase entirely. BFS now batches prefetch per level across the whole frontier, then runs the existing sequential process_dependency walk against the warmed cache. For install paths (Context::pipeline_deps_options), skip_preload stays false so PackageResolved events still feed the download/clone pipeline. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 8 ++- crates/ruborist/src/resolver/builder.rs | 71 +++++++++++++++++++++--- crates/ruborist/src/service/api.rs | 21 ++++++- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index b47def019..bc4d7faa1 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -63,6 +63,7 @@ impl Context { receiver, supports_semver: get_supports_semver(), catalogs, + skip_preload: false, } } @@ -82,8 +83,13 @@ impl Context { /// Resolve dependency tree with plain ProgressReceiver. Returns /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. + /// + /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes + /// `PackageResolved` events here, so preload is pure overhead — BFS's + /// own per-level parallel prefetch warms the manifest cache. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { - let options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + options.skip_preload = true; let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 166372c91..d811fc38c 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -18,21 +18,22 @@ //! This separation allows for maximum parallelism during network I/O //! while keeping the graph building logic simple and deterministic. 
-use petgraph::graph::NodeIndex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; +use futures::stream::{self, StreamExt}; +use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_registry_dep}; -use crate::spec::{Catalogs, PackageSpec, Protocol}; +use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,9 +182,6 @@ struct NodeFlags { /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { - use crate::spec::SpecStr; - use std::collections::HashSet; - let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,20 +803,74 @@ async fn run_preload_phase( } /// Run the BFS traversal phase to build the dependency tree. +/// +/// Each level does a parallel prefetch of all unresolved registry specs +/// before the sequential `process_dependency` walk. The prefetch warms +/// the registry's manifest cache so the per-edge `process_dependency` +/// calls below hit cache instead of awaiting network. +/// +/// This collapses the previously-separate `run_preload_phase` (which +/// fetched all transitive manifests up-front) into per-level batches. 
+/// Net effect on `utoo deps`: no separate preload wall — fetch happens +/// inside BFS in waves matching the dep tree's natural levels. For +/// install paths (p0/p3), `run_preload_phase` may still run via +/// `skip_preload=false` and feed the `PackageResolved` pipeline event. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { + // Reset fetch counters so the breakdown line reports fetches issued + // *during* this BFS phase, not preload's. (Preload still runs for + // install-path callers and reports its own breakdown.) + if config.skip_preload { + crate::util::FETCH_TIMINGS.reset(); + } + let start = tokio::time::Instant::now(); + let mut total_prefetch_wall_us: u64 = 0; + let mut total_merge_wall_us: u64 = 0; let mut current_level = vec![graph.root_index]; + let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); + + // Phase A: collect unresolved registry edges across the whole level + // (deduplicated against earlier levels — once a (name, spec) is + // prefetched, the registry's cache satisfies every subsequent + // `process_dependency` call). + let mut prefetch_targets: Vec<(String, String)> = Vec::new(); + for &node_index in ¤t_level { + for edge in collect_unresolved_edges(graph, node_index) { + if edge.spec.is_registry_spec() { + let key = format!("{}@{}", edge.name, edge.spec); + if prefetched.insert(key) { + prefetch_targets.push((edge.name, edge.spec)); + } + } + } + } + + // Phase B: parallel prefetch — pure cache warming. Errors are + // ignored here; the sequential `process_dependency` below will + // re-issue (now hitting either cache or the same fresh failure) + // and propagate any real error through the existing path. 
+ if !prefetch_targets.is_empty() { + let prefetch_start = tokio::time::Instant::now(); + stream::iter(prefetch_targets) + .for_each_concurrent(config.concurrency, |(name, spec)| async move { + let _ = resolve_package(registry, &name, &spec).await; + }) + .await; + total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; + } + + let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -900,14 +952,17 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); + total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", bfs_elapsed.as_millis(), + total_prefetch_wall_us, + total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 82703ed97..5a14f2a56 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -70,6 +70,16 @@ pub struct BuildDepsOptions { /// Catalog definitions for the `catalog:` dependency protocol. /// Key `""` = default catalog, other keys = named catalogs. pub catalogs: Catalogs, + /// When true, skip the up-front `run_preload_phase`. Set by callers + /// that don't consume the `BuildEvent::PackageResolved` pipeline + /// stream — e.g. `utoo deps` (lockfile-only). The BFS phase has its + /// own per-level prefetch that warms the manifest cache, so dropping + /// preload doesn't change correctness, only avoids the redundant + /// up-front fetch + dedicated wall. 
+ /// Install paths (which feed `PipelineReceiver` to start tarball + /// downloads as resolves complete) leave this false so preload still + /// emits PackageResolved events to the pipeline. + pub skip_preload: bool, } impl BuildDepsOptions { @@ -91,6 +101,7 @@ impl BuildDepsOptions { receiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, } } } @@ -132,6 +143,7 @@ where receiver, supports_semver, catalogs, + skip_preload: skip_preload_caller, } = options; // 1. Find root path (workspace root if applicable) @@ -234,7 +246,13 @@ where registry.supports_semver(), ); - let skip_preload = cache_count > 0; + // Skip preload when: + // - the caller asked us to (e.g. `utoo deps`, no pipeline consumer + // for PackageResolved events — BFS does its own per-level + // prefetch, preload is redundant), OR + // - the project's warm cache already has manifests covering most + // of the workload (existing skip-on-warm behavior). + let skip_preload = skip_preload_caller || cache_count > 0; let mut config = BuildDepsConfig::default() .with_peer_deps(peer_deps) .with_concurrency(concurrency) @@ -334,6 +352,7 @@ mod tests { receiver: NoopReceiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, }; assert_eq!(options.concurrency, 20); From 596cd2045fd6ef5031703343b52ccad2a67a907f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:18:21 +0800 Subject: [PATCH 11/24] perf(pm): fast_preload bypasses UnifiedRegistry for utoo deps path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::fast_preload, a manifest-bench-style flat FuturesUnordered over service::manifest::fetch_full_manifest. It warms MemoryCache (both full_manifests and version_manifests slots) synchronously after each fetch, so the BFS phase is pure cache-hit: no rayon hop on extract_core_version, no OnceMap gates, no DiskManifestStore writes, no PackageResolved events. 
Wired into service::api::build_deps: when the caller asks to skip preload (Context::build_deps for `utoo deps`) and there's no warm project cache, fast_preload runs ahead of build_deps_with_config. Install paths still go through preload_manifests so the pipeline keeps its early-start signal. Also reverts the per-level prefetch I added in 394f6c92 — with fast_preload pre-warming everything, BFS doesn't need its own prefetch wave. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 72 ++---- crates/ruborist/src/resolver/fast_preload.rs | 234 +++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 26 +++ 4 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 crates/ruborist/src/resolver/fast_preload.rs diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index d811fc38c..156622502 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -24,7 +24,6 @@ use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; -use futures::stream::{self, StreamExt}; use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; @@ -32,7 +31,7 @@ use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::resolver::registry::{ResolveError, resolve_registry_dep}; use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,7 +180,10 @@ struct NodeFlags { /// Only registry specs (e.g. `^4.17.0`) are collected. 
`catalog:` specs are /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. -fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { +pub(crate) fn gather_preload_deps( + graph: &DependencyGraph, + peer_deps: PeerDeps, +) -> Vec<(String, String)> { let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,72 +807,29 @@ async fn run_preload_phase( /// Run the BFS traversal phase to build the dependency tree. /// /// Each level does a parallel prefetch of all unresolved registry specs -/// before the sequential `process_dependency` walk. The prefetch warms -/// the registry's manifest cache so the per-edge `process_dependency` -/// calls below hit cache instead of awaiting network. +/// before the sequential `process_dependency` walk. /// -/// This collapses the previously-separate `run_preload_phase` (which -/// fetched all transitive manifests up-front) into per-level batches. -/// Net effect on `utoo deps`: no separate preload wall — fetch happens -/// inside BFS in waves matching the dep tree's natural levels. For -/// install paths (p0/p3), `run_preload_phase` may still run via -/// `skip_preload=false` and feed the `PackageResolved` pipeline event. +/// When `skip_preload=true` (lockfile-only path), the caller is +/// expected to have already populated `registry.cache()` via +/// [`super::fast_preload::fast_preload`], so this BFS sees only +/// cache hits. When `skip_preload=false` (install paths), the +/// receiver-driven [`super::preload::preload_manifests`] runs ahead +/// of this phase and feeds `BuildEvent::PackageResolved` to the +/// pipeline. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { - // Reset fetch counters so the breakdown line reports fetches issued - // *during* this BFS phase, not preload's. 
(Preload still runs for - // install-path callers and reports its own breakdown.) - if config.skip_preload { - crate::util::FETCH_TIMINGS.reset(); - } - let start = tokio::time::Instant::now(); - let mut total_prefetch_wall_us: u64 = 0; - let mut total_merge_wall_us: u64 = 0; - let mut current_level = vec![graph.root_index]; - let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); - // Phase A: collect unresolved registry edges across the whole level - // (deduplicated against earlier levels — once a (name, spec) is - // prefetched, the registry's cache satisfies every subsequent - // `process_dependency` call). - let mut prefetch_targets: Vec<(String, String)> = Vec::new(); - for &node_index in ¤t_level { - for edge in collect_unresolved_edges(graph, node_index) { - if edge.spec.is_registry_spec() { - let key = format!("{}@{}", edge.name, edge.spec); - if prefetched.insert(key) { - prefetch_targets.push((edge.name, edge.spec)); - } - } - } - } - - // Phase B: parallel prefetch — pure cache warming. Errors are - // ignored here; the sequential `process_dependency` below will - // re-issue (now hitting either cache or the same fresh failure) - // and propagate any real error through the existing path. 
- if !prefetch_targets.is_empty() { - let prefetch_start = tokio::time::Instant::now(); - stream::iter(prefetch_targets) - .for_each_concurrent(config.concurrency, |(name, spec)| async move { - let _ = resolve_package(registry, &name, &spec).await; - }) - .await; - total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; - } - - let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -952,17 +911,14 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); - total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", + "p1-breakdown bfs_wall={}ms | {}", bfs_elapsed.as_millis(), - total_prefetch_wall_us, - total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs new file mode 100644 index 000000000..975c18a81 --- /dev/null +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -0,0 +1,234 @@ +//! Lean parallel manifest fetcher modeled on `manifest-bench`. +//! +//! Bypasses [`crate::service::registry::UnifiedRegistry`] — and therefore +//! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, +//! and `EventReceiver` event dispatch — to drive a flat +//! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] +//! plus a synchronous transitive walk. The warm +//! [`crate::service::cache::MemoryCache`] it leaves behind makes the +//! subsequent BFS phase a pure cache-hit walk: no network, no rayon +//! re-parse hop on `extract_core_version`. +//! +//! Intended for the lockfile-only path (`utoo deps`) which has no +//! 
pipeline consumer for `BuildEvent::PackageResolved` — install paths +//! still go through [`super::preload::preload_manifests`] so the +//! pipeline keeps its early-start signal. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::CoreVersionManifest; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +/// Statistics from the lean fetch loop. Mirrors `PreloadStats` shape so +/// the bench-grep regex stays the same. +#[derive(Debug, Default)] +pub struct FastPreloadStats { + pub success_count: usize, + pub failed_count: usize, + pub fetched_names: usize, + pub min_request_ms: u64, + pub max_request_ms: u64, + pub total_request_ms: u64, +} + +/// Collect dependencies from any deps map, filtering out non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +/// Extract transitive dependencies from a resolved manifest. +/// devDependencies are omitted (only the root installs devDeps). +fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut deps = Vec::new(); + deps.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + deps.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + deps.extend(collect_deps(manifest.optional_dependencies.as_ref())); + deps +} + +/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. 
+/// +/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does +/// after a cache hit — pick a version, parse just that subset, populate +/// the per-version cache slot the BFS phase will read from. Skips the +/// rayon/`spawn_blocking` hop because the caller is already doing +/// CPU-bound bookkeeping between fetches. +fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { + let Some(full) = cache.get_full_manifest(name) else { + return Vec::new(); + }; + let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { + return extract_transitive_deps(&cached, peer_deps); + } + let Some(core) = full.get_core_version(&resolved_version) else { + return Vec::new(); + }; + let core_arc = Arc::new(core); + cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch of all transitively-reachable +/// registry manifests. Populates `cache` with both `full_manifests` and +/// `version_manifests` slots so the subsequent BFS does no network and no +/// re-parse. +/// +/// `initial_deps` should already be the union of root+workspace +/// registry edges, with non-registry specs filtered out. +pub async fn fast_preload( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> FastPreloadStats { + let mut stats = FastPreloadStats::default(); + let mut pending: VecDeque = VecDeque::from(initial_deps); + // Specs we've already enqueued (or settled). Prevents duplicate + // sync resolutions from re-walking the same transitive subtree. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + // Names whose full manifest is either cached or in flight. 
Spec-level + // dedup happens in `seen_specs` above; this set is the gate that + // prevents two concurrent fetches for the same package (sibling + // specs queue against the in-flight one rather than racing). + let mut fetched_names: HashSet = HashSet::new(); + // Specs that arrived while their package's full manifest was still + // in flight — we'll settle them once the fetch lands. + let mut deferred_specs: Vec<(String, String)> = Vec::new(); + let mut futs = FuturesUnordered::new(); + let concurrency = config.concurrency; + let peer_deps = config.peer_deps; + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen_specs.insert((name.clone(), spec.clone())) { + continue; + } + + // Full manifest already cached: skip the network round-trip, + // settle synchronously and queue this package's transitive + // deps. This is the hot path on the second-and-later spec + // for any popular package (lodash, semver, etc.). + if cache.get_full_manifest(&name).is_some() { + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + continue; + } + + // Fetch in flight for this name — defer settling this spec + // until the fetch lands. The deferred set is small (only + // sibling specs for in-flight names) so the linear scan is + // cheaper than another HashMap. 
+ if !fetched_names.insert(name.clone()) { + deferred_specs.push((name, spec)); + continue; + } + + let registry_url = registry_url.to_string(); + let n = name.clone(); + futs.push(async move { + let start = tokio::time::Instant::now(); + let result = fetch_full_manifest(FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }) + .await; + let elapsed_ms = start.elapsed().as_millis() as u64; + (name, spec, result, elapsed_ms) + }); + } + + if futs.is_empty() { + break; + } + + let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + break; + }; + + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; + + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + cache.set_full_manifest(name.clone(), Arc::new(manifest)); + + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + + // Drain any sibling specs that arrived while this fetch + // was in flight. `extract_if`-style retain in place. + let mut i = 0; + while i < deferred_specs.len() { + if deferred_specs[i].0 == name { + let (n, s) = deferred_specs.swap_remove(i); + let new_deps = settle_spec(&n, &s, cache, peer_deps); + pending.extend(new_deps); + } else { + i += 1; + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is unreachable + // here in practice; treat it as a soft-failure to keep the + // path total. 
+ stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); + } + } + } + + let total = stats.success_count + stats.failed_count; + let avg_ms = if total > 0 { + stats.total_request_ms / total as u64 + } else { + 0 + }; + tracing::info!( + "p1-breakdown fast_preload n={} ok={} fail={} avg_req={}ms min={}ms max={}ms | {}", + total, + stats.success_count, + stats.failed_count, + avg_ms, + stats.min_request_ms, + stats.max_request_ms, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index 582e03b31..e7baad988 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -3,6 +3,7 @@ pub mod builder; pub mod common; pub mod edges; +pub mod fast_preload; #[cfg(feature = "native-git")] pub mod git; #[cfg(feature = "http-tarball")] diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 5a14f2a56..3b9b713ea 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,7 +36,10 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, + gather_preload_deps, }; +use crate::resolver::fast_preload::fast_preload; +use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; use crate::spec::Catalogs; @@ -269,6 +272,29 @@ where ); } + // Lockfile-only callers (`utoo deps`) skip the receiver-driven + // `run_preload_phase` because they have no pipeline consumer for + // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat + // `FuturesUnordered` over `fetch_full_manifest` that warms the + // `MemoryCache` so the BFS phase below is pure cache-hit. 
This is + // the manifest-bench-style path; the heavier `preload_manifests` + // path (with `OnceMap` gates + `EventReceiver` events) only runs + // for install paths that need the pipeline signal. + if skip_preload_caller && cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + fast_preload( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. From 2e74bba904e391931a71960464932334e0d46e94 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:51:27 +0800 Subject: [PATCH 12/24] perf(pm): dispatch fast_preload settle to rayon to free tokio runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 of fast_preload called settle_spec inline on the tokio worker — each settle ran simd_json::to_borrowed_value over the full manifest's raw bytes (5–10ms per spec) right on the runtime thread. CI showed it starved sibling fetches: avg_request rose +3ms, avg_parse jumped 5→11ms, p1_resolve regressed +1.0s vs the preload+BFS baseline (4.0s vs 3.0s). Fix: route every settle through extract_core_version_off_runtime (the same rayon::spawn helper the BFS path uses), and merge fetch and settle completions into a single FuturesUnordered so backpressure on either side throttles the other. Sibling specs that arrived during a fetch are now stashed by name (HashMap, not linear scan), then dispatched as their own settle futures when the fetch lands. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 248 ++++++++++++------- 1 file changed, 163 insertions(+), 85 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 975c18a81..faea79752 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a synchronous transitive walk. The warm +//! plus a rayon-dispatched per-spec settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -13,13 +13,28 @@ //! pipeline consumer for `BuildEvent::PackageResolved` — install paths //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. +//! +//! ## Why settle is dispatched off-runtime +//! +//! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a +//! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` +//! over the manifest's raw bytes. That parse is 5–10ms per spec on a +//! 100KB body. Calling it inline on the tokio runtime (the v1 of this +//! module) starves the runtime worker — sibling fetches in flight stop +//! draining their sockets while the worker is parsing, which CI showed +//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the +//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` +//! (the same path the `extract_core_version_off_runtime` helper takes) +//! keeps the runtime free to drive I/O. 
-use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use futures::future::BoxFuture; use futures::stream::{FuturesUnordered, StreamExt}; -use crate::model::manifest::CoreVersionManifest; +use crate::model::manifest::{CoreVersionManifest, FullManifest, extract_core_version_off_runtime}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; @@ -41,8 +56,32 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } +/// Output of one in-flight future. The main loop merges fetch and settle +/// completions through a single `FuturesUnordered` so backpressure on +/// either side throttles the other naturally. +/// +/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- +/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes +/// the variant large enough that clippy flags the size delta with +/// `Settled`. The cost is one heap allocation per fetched manifest; +/// trivial against the network round-trip we already paid. +#[allow(clippy::large_enum_variant)] +enum FastEvent { + Fetched { + name: String, + primary_spec: String, + result: anyhow::Result, + elapsed_ms: u64, + }, + Settled { + new_deps: Vec, + }, +} + +type FastFut = Pin + Send>>; + /// Collect dependencies from any deps map, filtering out non-registry specs. -fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -62,29 +101,41 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. +/// Resolve `(name, spec)` against `full` off the tokio runtime. 
/// -/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does -/// after a cache hit — pick a version, parse just that subset, populate -/// the per-version cache slot the BFS phase will read from. Skips the -/// rayon/`spawn_blocking` hop because the caller is already doing -/// CPU-bound bookkeeping between fetches. -fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { - let Some(full) = cache.get_full_manifest(name) else { - return Vec::new(); - }; - let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { - return Vec::new(); - }; - if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { - return extract_transitive_deps(&cached, peer_deps); - } - let Some(core) = full.get_core_version(&resolved_version) else { - return Vec::new(); - }; - let core_arc = Arc::new(core); - cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); - extract_transitive_deps(&core_arc, peer_deps) +/// Returns the freshly-extracted version manifest's transitive deps so +/// the caller can extend its pending queue. The heavy +/// `simd_json::to_borrowed_value` parse runs inside +/// `extract_core_version_off_runtime`, which dispatches to rayon — same +/// path the BFS phase uses for cold extracts. 
+fn settle_future( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> BoxFuture<'static, FastEvent> { + Box::pin(async move { + let resolved_version = match resolve_target_version((&*full).into(), &spec) { + Ok(v) => v, + Err(_) => return FastEvent::Settled { new_deps: vec![] }, + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + return FastEvent::Settled { + new_deps: extract_transitive_deps(&cached, peer_deps), + }; + } + let (resolved_version, core) = + extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; + let new_deps = match core { + Some(core_arc) => { + cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) + } + None => Vec::new(), + }; + FastEvent::Settled { new_deps } + }) } /// Manifest-bench-style flat parallel fetch of all transitively-reachable @@ -103,17 +154,15 @@ pub async fn fast_preload( let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); // Specs we've already enqueued (or settled). Prevents duplicate - // sync resolutions from re-walking the same transitive subtree. + // settles from re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - // Names whose full manifest is either cached or in flight. Spec-level - // dedup happens in `seen_specs` above; this set is the gate that - // prevents two concurrent fetches for the same package (sibling - // specs queue against the in-flight one rather than racing). + // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); - // Specs that arrived while their package's full manifest was still - // in flight — we'll settle them once the fetch lands. 
- let mut deferred_specs: Vec<(String, String)> = Vec::new(); - let mut futs = FuturesUnordered::new(); + // Sibling specs that arrived while their package's full manifest + // was still in flight. The fetch's completion handler drains this + // bucket — we stash by name so the lookup is one HashMap probe. + let mut deferred_by_name: HashMap> = HashMap::new(); + let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; let peer_deps = config.peer_deps; @@ -126,28 +175,33 @@ pub async fn fast_preload( continue; } - // Full manifest already cached: skip the network round-trip, - // settle synchronously and queue this package's transitive - // deps. This is the hot path on the second-and-later spec - // for any popular package (lodash, semver, etc.). - if cache.get_full_manifest(&name).is_some() { - let new_deps = settle_spec(&name, &spec, cache, peer_deps); - pending.extend(new_deps); + // Hot path: the full manifest is already cached (a sibling + // spec for this name has already returned). Dispatch a + // settle so the parse work runs on rayon, not on the tokio + // worker — keeps the runtime free for ongoing fetches. + if let Some(full) = cache.get_full_manifest(&name) { + futs.push(Box::pin(settle_future( + name, + spec, + full, + cache.clone(), + peer_deps, + ))); continue; } - // Fetch in flight for this name — defer settling this spec - // until the fetch lands. The deferred set is small (only - // sibling specs for in-flight names) so the linear scan is - // cheaper than another HashMap. + // A fetch for this name is already in flight: stash this + // spec; the fetch's completion handler will dispatch its + // settle. 
if !fetched_names.insert(name.clone()) { - deferred_specs.push((name, spec)); + deferred_by_name.entry(name).or_default().push(spec); continue; } let registry_url = registry_url.to_string(); + let primary_spec = spec.clone(); let n = name.clone(); - futs.push(async move { + futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); let result = fetch_full_manifest(FetchManifestOptions { registry_url: ®istry_url, @@ -157,58 +211,82 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - (name, spec, result, elapsed_ms) - }); + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } + })); } if futs.is_empty() { break; } - let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + let Some(event) = futs.next().await else { break; }; - if stats.success_count == 0 && stats.failed_count == 0 { - stats.min_request_ms = elapsed_ms; - stats.max_request_ms = elapsed_ms; - } else { - stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); - stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); - } - stats.total_request_ms += elapsed_ms; + match event { + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } => { + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - stats.success_count += 1; - stats.fetched_names += 1; - cache.set_full_manifest(name.clone(), Arc::new(manifest)); + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + let full_arc = Arc::new(manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - let new_deps = settle_spec(&name, 
&spec, cache, peer_deps); - pending.extend(new_deps); + // Primary settle. + futs.push(Box::pin(settle_future( + name.clone(), + primary_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); - // Drain any sibling specs that arrived while this fetch - // was in flight. `extract_if`-style retain in place. - let mut i = 0; - while i < deferred_specs.len() { - if deferred_specs[i].0 == name { - let (n, s) = deferred_specs.swap_remove(i); - let new_deps = settle_spec(&n, &s, cache, peer_deps); - pending.extend(new_deps); - } else { - i += 1; + // Sibling settles that were stashed while the + // fetch was in flight. + if let Some(siblings) = deferred_by_name.remove(&name) { + for sibling_spec in siblings { + futs.push(Box::pin(settle_future( + name.clone(), + sibling_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is + // unreachable in practice; treat as soft failure. + stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is unreachable - // here in practice; treat it as a soft-failure to keep the - // path total. 
- stats.failed_count += 1; - } - Err(e) => { - stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); + FastEvent::Settled { new_deps } => { + pending.extend(new_deps); } } } From 04c9ec34d26fdb97f83014c9a09e241cd64715aa Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:19:48 +0800 Subject: [PATCH 13/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=2096=20(manifest-bench=20best)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone manifest-bench HTTP-only sweep (npmjs, h1) shows wall bottoming at concurrency=96 (1817ms) — earlier 256 regression was caused by rayon-queued parses behind 2 workers, no longer relevant since fetch parse is on spawn_blocking and settle is rayon-dispatched off the runtime. fast_preload's wave-shaped transitive walk currently runs at eff_parallel ~35 against the 64 cap because pending refills lag settles; raising the cap to 96 gives headroom for sustained in-flight on the deep waves without crossing the npmjs per-IP tail-latency cliff that conc 128+ trips. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index a0235830a..f05b0f52f 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,15 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to -// rayon, which has only num_cpus (=2 on GHA) workers. Bumping -// concurrency to 256 queued 256 parses behind 2 workers → wall -// per-parse jumped from 730µs to 266ms. 
Net p1 wall *increased* -// 3.10s → 3.33s on phases bench. Keep 64 until we address the -// parse-side queueing (e.g. inline parse on tokio, or a wider -// dedicated parse pool). +// Once we moved fetch parse off rayon to tokio's spawn_blocking pool +// (cap 512) and settle off the runtime via rayon::spawn, the original +// 256-concurrency regression mechanism (parses queued behind 2 rayon +// workers) no longer applies. The standalone manifest-bench HTTP-only +// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 +// (1817ms) and tracking flat-then-rising past that — beyond ~96 +// in-flight, npmjs's per-IP rate degrades and tail latency widens. +// 96 is the sweet spot: enough headroom for the wave-shaped transitive +// dep walk in fast_preload to keep the runtime busy, without paying the +// p99 widening that 128+ shows. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 6455852e518b3cc9859e12442972f40697360d73 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:46:06 +0800 Subject: [PATCH 14/24] perf(pm): fast_preload populates (name, spec) cache slot for BFS fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedRegistry::resolve_version_manifest`'s first cache check (service/registry.rs:347) keys on `(name, spec)` — the original spec string the caller passed, e.g. `^4.0.0`. settle_future was only populating `(name, resolved_version)` (e.g. `4.17.21`), so on every BFS edge for `lodash@^4.0.0`-style specs the warm path missed and fell into the OnceMap inflight gate + `resolve_via_full_manifest` re-walk before recovering the manifest from the `(name, resolved_version)` slot we'd already set. 
Now settle writes both keys so BFS hits the early-return at service/registry.rs:347 with no further dispatch. Saves ~1 OnceMap+resolve_target_version round-trip per unique (name, spec) the BFS encounters (≈3000 calls on ant-design-x). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index faea79752..c3845a73a 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -121,6 +121,8 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + // Populate the (name, spec) slot too — see comment below. + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), }; @@ -129,6 +131,18 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { + // Populate BOTH cache slots so the subsequent BFS hits the + // fast path on its first call: + // * `(name, resolved_version)` — what + // `resolve_via_full_manifest` writes in the cold path, + // and what `extract_core_version_off_runtime`'s callers + // elsewhere expect. + // * `(name, spec)` — what `resolve_version_manifest`'s + // first cache check uses (line 347 in service/registry.rs). + // Without this slot, BFS still pays one OnceMap dispatch + // + `resolve_via_full_manifest` walk per `(name, spec)`, + // even though we've already done that work here. 
+ cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) } From 4bbcae8083de94ea69b6ef19611cdb59c719ca9c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:12:08 +0800 Subject: [PATCH 15/24] perf(pm): fuse primary settle into fetch task to drop dispatch RTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fast_preload (v2) dispatched primary settles to rayon as separate FuturesUnordered futures. CI breakdown showed eff_parallel ~44 against the conc=96 cap — the wave-shaped transitive walk was held back by settle dispatch RTT: each fetch landed → primary settle queued → settle popped → only then did `pending` get transitive deps and fill the next dispatch wave. v3 folds the primary settle into the fetch task itself via `tokio::task::spawn_blocking`. The fetch task does the network round-trip and the primary version-extract on the same blocking pool slot, then returns with the resolved CoreVersionManifest attached. Main loop pulls one Fetched event, immediately extends `pending`, no second `next().await` to wait through the queue. Sibling specs (rare; same name, different range) still go through the rayon settle_future path so the primary path stays lean. Carries primary_spec through FastEvent so the fused path can populate both `(name, primary_spec)` and `(name, resolved_version)` cache slots — preserves the 6455852e BFS fast-path win. FetchOutcome enum replaces by-value FetchManifestResult to avoid a full FullManifest clone (HashMap+Vec) per fetch event. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 206 ++++++++++++------- 1 file changed, 135 insertions(+), 71 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index c3845a73a..008030139 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a rayon-dispatched per-spec settle. The warm +//! plus a fused-into-fetch primary settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -14,18 +14,30 @@ //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. //! -//! ## Why settle is dispatched off-runtime +//! ## Why settle is fused into the fetch task //! //! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a //! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` //! over the manifest's raw bytes. That parse is 5–10ms per spec on a -//! 100KB body. Calling it inline on the tokio runtime (the v1 of this -//! module) starves the runtime worker — sibling fetches in flight stop -//! draining their sockets while the worker is parsing, which CI showed -//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the -//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` -//! (the same path the `extract_core_version_off_runtime` helper takes) -//! keeps the runtime free to drive I/O. +//! 100KB body. +//! +//! v1 ran settle inline on the tokio runtime worker — that starved +//! 
sibling fetches' I/O drive (CI showed `avg_request` +3ms, +//! `avg_parse` 5→11ms). v2 dispatched settle to rayon via a separate +//! `FuturesUnordered` future, which fixed the runtime starvation but +//! introduced a dispatch RTT: fetch lands → rayon settle queued → settle +//! pops → `pending` finally gets transitive deps. That round-trip held +//! the wave-shaped transitive walk back, capping `eff_parallel` at ~44 +//! against a 96 cap. +//! +//! v3 (this) folds the primary settle into the fetch task itself via +//! `tokio::task::spawn_blocking`. The fetch task awaits both the +//! network round-trip and the version-extract on the same blocking +//! pool slot, then returns with the resolved `CoreVersionManifest` +//! attached. The main loop pulls a single `Fetched` event and +//! immediately extends `pending` — no separate settle pop. Sibling +//! specs (rare; same package, different range) still go through a +//! `Settled` future to keep the primary path lean. use std::collections::{HashMap, HashSet, VecDeque}; use std::pin::Pin; @@ -56,21 +68,31 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } -/// Output of one in-flight future. The main loop merges fetch and settle -/// completions through a single `FuturesUnordered` so backpressure on -/// either side throttles the other naturally. -/// -/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- -/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes -/// the variant large enough that clippy flags the size delta with -/// `Settled`. The cost is one heap allocation per fetched manifest; -/// trivial against the network round-trip we already paid. -#[allow(clippy::large_enum_variant)] +/// One fetch's primary settle outcome — the resolved version + parsed +/// `CoreVersionManifest` for the spec the fetch was originally issued +/// for. `None` means the spec didn't match any version (caller treats +/// as soft skip). 
+type PrimarySettle = Option<(String, Arc<CoreVersionManifest>)>;
+
+/// Outcome of a fetch task. Owning `Arc<FullManifest>` (rather than
+/// `FetchManifestResult` by-value) means the fetch task can `Arc::clone`
+/// once for the primary settle, then pass ownership along — no full
+/// `FullManifest` clone (which would copy the 200-entry `time`
+/// HashMap + the `versions` `Vec` per fetch).
+enum FetchOutcome {
+    Ok(Arc<FullManifest>),
+    NotModified,
+    Err,
+}
+
+/// Output of one in-flight future. The main loop merges fetch and
+/// sibling-settle completions through a single `FuturesUnordered`.
 enum FastEvent {
     Fetched {
         name: String,
         primary_spec: String,
-        result: anyhow::Result<FetchManifestResult>,
+        outcome: FetchOutcome,
+        primary_settle: PrimarySettle,
         elapsed_ms: u64,
     },
     Settled {
@@ -101,13 +123,9 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps)
     deps
 }
 
-/// Resolve `(name, spec)` against `full` off the tokio runtime.
-///
-/// Returns the freshly-extracted version manifest's transitive deps so
-/// the caller can extend its pending queue. The heavy
-/// `simd_json::to_borrowed_value` parse runs inside
-/// `extract_core_version_off_runtime`, which dispatches to rayon — same
-/// path the BFS phase uses for cold extracts.
+/// Off-runtime settle for a `(name, spec)` whose `FullManifest` is
+/// already cached. Used for sibling specs — multiple ranges on the
+/// same package — that arrive after the primary fetch has landed.
 fn settle_future(
     name: String,
     spec: String,
@@ -121,7 +139,6 @@ fn settle_future(
             Err(_) => return FastEvent::Settled { new_deps: vec![] },
         };
         if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) {
-            // Populate the (name, spec) slot too — see comment below.
cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), @@ -131,17 +148,6 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { - // Populate BOTH cache slots so the subsequent BFS hits the - // fast path on its first call: - // * `(name, resolved_version)` — what - // `resolve_via_full_manifest` writes in the cold path, - // and what `extract_core_version_off_runtime`'s callers - // elsewhere expect. - // * `(name, spec)` — what `resolve_version_manifest`'s - // first cache check uses (line 347 in service/registry.rs). - // Without this slot, BFS still pays one OnceMap dispatch - // + `resolve_via_full_manifest` walk per `(name, spec)`, - // even though we've already done that work here. cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) @@ -152,6 +158,35 @@ fn settle_future( }) } +/// Resolve `(name, spec)` against `full` on tokio's blocking pool. +/// +/// Same shape as `extract_core_version_off_runtime` (which uses rayon), +/// but stays inside the fetch task so the result lands together with +/// the network round-trip — no separate `FuturesUnordered` pop, so +/// `pending` gets the transitive deps the moment the fetch event is +/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is +/// `max(num_cpus, 8)`. With many primary settles arriving in waves, +/// the wider blocking pool absorbs the burst better than rayon would. 
+async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + }) + .await + .ok() + .flatten() + } + #[cfg(target_arch = "wasm32")] + { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + } +} + /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -167,14 +202,14 @@ pub async fn fast_preload( ) -> FastPreloadStats { let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); - // Specs we've already enqueued (or settled). Prevents duplicate - // settles from re-walking the same transitive subtree. + // Specs we've already enqueued. Prevents duplicate settles from + // re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); // Sibling specs that arrived while their package's full manifest - // was still in flight. The fetch's completion handler drains this - // bucket — we stash by name so the lookup is one HashMap probe. + // was still in flight. The fetch's completion handler dispatches + // settles for them, then drains this bucket. let mut deferred_by_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; @@ -189,10 +224,10 @@ pub async fn fast_preload( continue; } - // Hot path: the full manifest is already cached (a sibling - // spec for this name has already returned). 
Dispatch a - // settle so the parse work runs on rayon, not on the tokio - // worker — keeps the runtime free for ongoing fetches. + // Hot path: a sibling spec for this name has already + // returned, so the full manifest is cached. Settle on + // rayon (off-runtime) — keeps the primary fetch path + // (next branch) clean. if let Some(full) = cache.get_full_manifest(&name) { futs.push(Box::pin(settle_future( name, @@ -205,8 +240,8 @@ pub async fn fast_preload( } // A fetch for this name is already in flight: stash this - // spec; the fetch's completion handler will dispatch its - // settle. + // sibling spec; the fetch's completion handler will + // dispatch a settle for it. if !fetched_names.insert(name.clone()) { deferred_by_name.entry(name).or_default().push(spec); continue; @@ -225,10 +260,30 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; + // Fuse the primary settle into the same task so the + // main loop sees the resolved version + transitive + // deps in the same event — no extra `next().await` to + // wait through the FuturesUnordered queue before + // `pending` can refill. 
+ let (outcome, primary_settle) = match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + let full_arc = Arc::new(manifest); + let settle = + resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) + .await; + (FetchOutcome::Ok(full_arc), settle) + } + Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Err(e) => { + tracing::debug!("fast_preload failed for {}: {}", n, e); + (FetchOutcome::Err, None) + } + }; FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } })); @@ -246,7 +301,8 @@ pub async fn fast_preload( FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } => { if stats.success_count == 0 && stats.failed_count == 0 { @@ -258,24 +314,36 @@ pub async fn fast_preload( } stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { + match outcome { + FetchOutcome::Ok(full_arc) => { stats.success_count += 1; stats.fetched_names += 1; - let full_arc = Arc::new(manifest); cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - // Primary settle. - futs.push(Box::pin(settle_future( - name.clone(), - primary_spec, - Arc::clone(&full_arc), - cache.clone(), - peer_deps, - ))); + // Apply the primary settle (already done inside + // the fetch task via spawn_blocking) — populate + // both `(name, primary_spec)` and + // `(name, resolved_version)` cache slots so BFS + // hits the early-return at registry.rs:347 on + // its first probe, then extend `pending` with + // the spec's transitive deps. + if let Some((resolved_version, core_arc)) = primary_settle { + cache.set_version_manifest( + name.clone(), + primary_spec, + Arc::clone(&core_arc), + ); + cache.set_version_manifest( + name.clone(), + resolved_version, + Arc::clone(&core_arc), + ); + pending.extend(extract_transitive_deps(&core_arc, peer_deps)); + } - // Sibling settles that were stashed while the - // fetch was in flight. 
+ // Sibling specs that were stashed while the + // fetch was in flight: dispatch each as a + // separate settle future. if let Some(siblings) = deferred_by_name.remove(&name) { for sibling_spec in siblings { futs.push(Box::pin(settle_future( @@ -288,14 +356,10 @@ pub async fn fast_preload( } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is - // unreachable in practice; treat as soft failure. - stats.failed_count += 1; - } - Err(e) => { + FetchOutcome::NotModified | FetchOutcome::Err => { + // 304 is unreachable in practice (no ETag sent); + // both branches treated as soft failure. stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); } } } From 671ac98e51e4a7ca4e53149c8bead24b4f144451 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:42:55 +0800 Subject: [PATCH 16/24] perf(pm): combined-parse fetch path eliminates per-fetch double simd_json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fast_preload hot path was paying TWO simd_json passes per manifest: 1. fetch_full_manifest's parse_json_off_runtime did a typed simd_json::serde::from_slice (envelope + IgnoredAny visitor on `versions` keys, ~3-5ms on a 100KB body). 2. Primary settle re-parsed the same raw bytes with simd_json::to_borrowed_value (~5-10ms) to extract one version's subtree. Both passes went through simd_json's Tape constructor — duplicated work. CI showed avg_parse 5-7ms × 2700 fetches = 14-19s of CPU sum on 2-core GHA, where the spawn_blocking pool's overlapping schedule masked some of the cost but not all. 
Adds `service::manifest::fetch_full_manifest_with_settle`: same HTTP + retry + ETag machinery as `fetch_full_manifest`, but the parse step does ONE `to_borrowed_value` and extracts: * envelope (`name`, `dist-tags`, `versions` keys) into FullManifest manually (no typed serde), and * the resolved version's subtree as a typed CoreVersionManifest (serde-deserializing that single subtree via the borrowed value). fast_preload's fetch task switches to this entry point — primary settle is now a free byproduct of the fetch parse, not a separate `to_borrowed_value` pass. Sibling specs (same name, different range) still go through the rayon settle_future path. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 68 ++---- crates/ruborist/src/service/manifest.rs | 208 +++++++++++++++++++ crates/ruborist/src/service/mod.rs | 5 +- 3 files changed, 231 insertions(+), 50 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 008030139..d049321d8 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -51,7 +51,8 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::{ - FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, }; use crate::spec::SpecStr; use crate::util::FETCH_TIMINGS; @@ -158,35 +159,6 @@ fn settle_future( }) } -/// Resolve `(name, spec)` against `full` on tokio's blocking pool. 
-/// -/// Same shape as `extract_core_version_off_runtime` (which uses rayon), -/// but stays inside the fetch task so the result lands together with -/// the network round-trip — no separate `FuturesUnordered` pop, so -/// `pending` gets the transitive deps the moment the fetch event is -/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is -/// `max(num_cpus, 8)`. With many primary settles arriving in waves, -/// the wider blocking pool absorbs the burst better than rayon would. -async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { - #[cfg(not(target_arch = "wasm32"))] - { - tokio::task::spawn_blocking(move || { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - }) - .await - .ok() - .flatten() - } - #[cfg(target_arch = "wasm32")] - { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - } -} - /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -252,28 +224,28 @@ pub async fn fast_preload( let n = name.clone(); futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); - let result = fetch_full_manifest(FetchManifestOptions { - registry_url: ®istry_url, - name: &n, - format: MetadataFormat::Abbreviated, - etag: None, - }) + // Combined fetch + envelope parse + primary settle in + // a single `to_borrowed_value` pass — replaces the old + // pattern of typed-serde envelope parse followed by a + // separate `to_borrowed_value` reparse for version + // extraction. Halves simd_json work per fetch. 
+ let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &primary_spec, + ) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - // Fuse the primary settle into the same task so the - // main loop sees the resolved version + transitive - // deps in the same event — no extra `next().await` to - // wait through the FuturesUnordered queue before - // `pending` can refill. let (outcome, primary_settle) = match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - let full_arc = Arc::new(manifest); - let settle = - resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) - .await; - (FetchOutcome::Ok(full_arc), settle) + Ok(FetchWithSettleResult::Ok(payload)) => { + let full_arc = Arc::new(payload.manifest); + (FetchOutcome::Ok(full_arc), payload.primary_settle) } - Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Ok(FetchWithSettleResult::NotModified) => (FetchOutcome::NotModified, None), Err(e) => { tracing::debug!("fast_preload failed for {}: {}", n, e); (FetchOutcome::Err, None) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 90f1db71b..38db87969 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -4,7 +4,11 @@ //! [`crate::service::fetch`] so retry policy stays uniform across registry //! manifest fetches and non-registry resolvers (git, http tarball). +use std::collections::HashMap; +use std::sync::Arc; + use anyhow::{Result, anyhow}; +use serde::Deserialize; use tokio_retry::RetryIf; use super::fetch::{ @@ -12,6 +16,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::resolver::version::resolve_target_version; use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on tokio's blocking thread pool. 
@@ -157,6 +162,209 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result<FetchManifestResult> {
+pub struct FetchWithSettle {
+    pub manifest: FullManifest,
+    pub etag: Option<String>,
+    /// `Some` when the requested spec resolves to a real version in
+    /// `manifest.versions`. `None` only on no-match (rare; usually a
+    /// spec referring to a yanked or moved version).
+    pub primary_settle: Option<PrimarySettleResult>,
+}
+
+/// `(resolved_version, parsed_subtree)` — what
+/// [`fetch_full_manifest_with_settle`] hands back to callers that
+/// supplied a `primary_spec`.
+pub type PrimarySettleResult = (String, Arc<CoreVersionManifest>);
+
+#[allow(clippy::large_enum_variant)]
+pub enum FetchWithSettleResult {
+    Ok(FetchWithSettle),
+    NotModified,
+}
+
+/// Fetch a full manifest and resolve the primary spec from the same
+/// parse pass.
+///
+/// Where [`fetch_full_manifest`] uses `simd_json::serde::from_slice`
+/// to materialize a typed `FullManifest` (cheap envelope, deep
+/// `versions` subtrees skipped via `IgnoredAny`) and leaves version
+/// subtree extraction to a later `simd_json::to_borrowed_value`
+/// reparse, this entry point does the borrowed-value parse once and
+/// extracts:
+/// * envelope fields needed by the resolver (`name`, `dist-tags`,
+///   `versions` keys),
+/// * the resolved-version subtree as a typed
+///   [`CoreVersionManifest`].
+///
+/// Saves one full simd_json pass on the parse hot path —
+/// `fast_preload` uses ~2700 of these per `utoo deps` cold run, so
+/// halving the per-fetch parse work meaningfully reduces CPU on
+/// 2-core CI.
+pub async fn fetch_full_manifest_with_settle( + opts: FetchManifestOptions<'_>, + primary_spec: &str, +) -> Result { + let url = format!("{}/{}", opts.registry_url, opts.name); + let etag_owned = opts.etag.map(|s| s.to_string()); + let primary_spec_owned = primary_spec.to_string(); + let accept = match opts.format { + MetadataFormat::Abbreviated => "application/vnd.npm.install-v1+json", + MetadataFormat::Complete => "application/json", + }; + + RetryIf::spawn( + retry_strategy(), + || { + let url = url.clone(); + let etag = etag_owned.clone(); + let primary_spec = primary_spec_owned.clone(); + async move { + let mut request = get_client() + .map_err(FetchError::Permanent)? + .get(&url) + .header("Accept", accept); + if let Some(etag_value) = &etag { + request = request.header("If-None-Match", etag_value); + } + + let t_request_start = std::time::Instant::now(); + let response = request.send().await.map_err(classify_reqwest_error)?; + let request_us = t_request_start.elapsed().as_micros() as u64; + let status = response.status(); + + if status == reqwest::StatusCode::NOT_MODIFIED { + if etag.is_some() { + return Ok(FetchWithSettleResult::NotModified); + } + return Err(classify_status(status, &url)); + } + + if status.is_success() { + let new_etag = response + .headers() + .get("etag") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let t_body_start = std::time::Instant::now(); + let raw_bytes = response + .bytes() + .await + .map_err(|e| FetchError::Permanent(anyhow!("Response read error: {e}")))? 
+ .to_vec(); + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = raw_bytes.len() as u64; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes); + + let t_parse_start = std::time::Instant::now(); + let parse_result = + parse_envelope_and_settle(Arc::clone(&raw_arc), primary_spec) + .await + .map_err(FetchError::Permanent)?; + let parse_us = t_parse_start.elapsed().as_micros() as u64; + + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + + let (manifest, primary_settle) = parse_result; + Ok(FetchWithSettleResult::Ok(FetchWithSettle { + manifest, + etag: new_etag, + primary_settle, + })) + } else { + Err(classify_status(status, &url)) + } + } + }, + is_retryable, + ) + .await + .map_err(|e| match e { + FetchError::Retryable(e) | FetchError::Permanent(e) => { + anyhow!("Failed to fetch {}: {:#}", opts.name, e) + } + }) +} + +/// Off-runtime combined parse: `simd_json::to_borrowed_value` once, +/// extract envelope into [`FullManifest`] + resolve `primary_spec` +/// against the parsed `versions` keys + materialize the resolved +/// version's subtree into [`CoreVersionManifest`]. +/// +/// Constructs `FullManifest` manually rather than via typed serde so +/// the work is exactly one parse pass. Other `FullManifest` fields +/// (`description`, `time`, `maintainers`, etc.) stay at `Default` +/// values — none are read on the resolver hot path. +async fn parse_envelope_and_settle( + raw: Arc<[u8]>, + primary_spec: String, +) -> Result<(FullManifest, Option)> { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || parse_envelope_and_settle_sync(raw, &primary_spec)) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? 
+    }
+    #[cfg(target_arch = "wasm32")]
+    {
+        parse_envelope_and_settle_sync(raw, &primary_spec)
+    }
+}
+
+fn parse_envelope_and_settle_sync(
+    raw: Arc<[u8]>,
+    primary_spec: &str,
+) -> Result<(FullManifest, Option<PrimarySettleResult>)> {
+    use simd_json::prelude::{ValueAsScalar, ValueObjectAccess};
+
+    let mut buf = (*raw).to_vec();
+    let parsed =
+        simd_json::to_borrowed_value(&mut buf).map_err(|e| anyhow!("JSON parse error: {e}"))?;
+
+    let name = parsed
+        .get("name")
+        .and_then(|v| v.as_str())
+        .map(|s| s.to_string())
+        .unwrap_or_default();
+
+    let dist_tags: HashMap<String, String> = parsed
+        .get("dist-tags")
+        .and_then(|v| HashMap::<String, String>::deserialize(v).ok())
+        .unwrap_or_default();
+
+    let versions_keys: Vec<String> = parsed
+        .get("versions")
+        .and_then(simd_json::prelude::ValueAsObject::as_object)
+        .map(|obj| obj.keys().map(|k| k.to_string()).collect())
+        .unwrap_or_default();
+
+    let manifest = FullManifest {
+        name,
+        dist_tags: dist_tags.clone(),
+        versions: versions_keys,
+        raw,
+        ..Default::default()
+    };
+
+    // Resolve spec against the just-extracted envelope.
+    let primary_settle = match resolve_target_version((&manifest).into(), primary_spec) {
+        Ok(resolved) => parsed
+            .get("versions")
+            .and_then(|v| v.get(resolved.as_str()))
+            .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())
+            .map(|core| (resolved, Arc::new(core))),
+        Err(_) => None,
+    };
+
+    Ok((manifest, primary_settle))
+}
+
 /// Fetch full manifest without ETag / 304 support.
/// /// Convenience wrapper around [`fetch_full_manifest`] for callers that never diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 13109e994..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -60,8 +60,9 @@ pub use cache::{ pub use fs::{Glob, NoopGlob, exists, read_to_string}; pub use http::client_builder; pub use manifest::{ - FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, MetadataFormat, - fetch_full_manifest, fetch_full_manifest_fresh, fetch_version_manifest, + FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, FetchWithSettle, + FetchWithSettleResult, MetadataFormat, fetch_full_manifest, fetch_full_manifest_fresh, + fetch_full_manifest_with_settle, fetch_version_manifest, }; pub use registry::UnifiedRegistry; pub use store::{ManifestStore, NoopStore}; From 542d7f144ec700ab5601247eff655399585fedbe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:11:45 +0800 Subject: [PATCH 17/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2096=20=E2=86=92=20128?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After 671ac98e's combined-parse fetch path eliminated the double simd_json pass, the spawn_blocking pool's contention ceiling rose enough that bumping concurrency past 96 no longer queues parses behind 2-core CPU. manifest-bench's most recent good-network sweep on GHA showed conc=128 hitting 1500ms vs conc=96 at 1566ms — small but real headroom for fast_preload's late-wave saturation now that initial waves fill faster. Risk: on slower-network runs (npmjs per-IP throttle), conc=128 widens p99. Earlier conc-sweep data was mixed — accepting that variance for the average-case improvement. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index f05b0f52f..2f389379e 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,18 +137,17 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once we moved fetch parse off rayon to tokio's spawn_blocking pool -// (cap 512) and settle off the runtime via rayon::spawn, the original -// 256-concurrency regression mechanism (parses queued behind 2 rayon -// workers) no longer applies. The standalone manifest-bench HTTP-only -// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 -// (1817ms) and tracking flat-then-rising past that — beyond ~96 -// in-flight, npmjs's per-IP rate degrades and tail latency widens. -// 96 is the sweet spot: enough headroom for the wave-shaped transitive -// dep walk in fast_preload to keep the runtime busy, without paying the -// p99 widening that 128+ shows. +// Once parse work shrank (combined `to_borrowed_value` pass replaces +// the typed-serde envelope parse + reparse), spawn_blocking pool +// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep +// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — +// in the most recent good-network run, conc=128 hit 1500ms vs +// conc=96 at 1566ms. Bumping to 128 narrows the gap between +// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 +// because pending takes ~2 wave depths to fill) and the cap, so +// the late-wave saturation has more headroom. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From c8768ac4ce8ca26a60a3313e22dba7ac625665d7 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:39:24 +0800 Subject: [PATCH 18/24] revert(pm): manifests-concurrency-limit back to 96 542d7f14's conc=128 bench landed in a slow-network run (mb best 2010ms vs 1500ms in the prior good-network run; bun also bumped to 2.14s vs 1.83s). Adjusted gap to mb best stayed flat (~700ms either way), so conc=128 didn't beat 96 across runs. Picking 96 as the conservative default: at-or-near best on every GHA run we've measured, never the worst, and leaves headroom for npmjs's per-IP throttling to absorb without compounding p99. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 2f389379e..f6924f5aa 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,17 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once parse work shrank (combined `to_borrowed_value` pass replaces -// the typed-serde envelope parse + reparse), spawn_blocking pool -// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep -// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — -// in the most recent good-network run, conc=128 hit 1500ms vs -// conc=96 at 1566ms. 
Bumping to 128 narrows the gap between -// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 -// because pending takes ~2 wave depths to fill) and the cap, so -// the late-wave saturation has more headroom. +// manifest-bench's HTTP-only sweep on GHA (npmjs, h1) bottoms out +// somewhere in the 96-128 band — which one wins varies with npmjs's +// per-IP latency on each run (good runs picked 128, slow-network +// runs flattened the curve and even regressed at 128 due to wider +// p99 from queued requests). 96 is the conservative pick: it's at +// or near best on every run we've measured, never the worst, and +// leaves headroom for npmjs to throttle without compounding queue +// time. Combined-parse fetch (671ac98e) made the spawn_blocking +// pool no longer a contention bottleneck, but didn't change the +// network-side variance — that's what caps the useful concurrency. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 3be7487d7ad772667ac125ce82955432c257f8d3 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 09:57:58 +0800 Subject: [PATCH 19/24] perf(pm): mb_resolve experimental fetch path (parallel track to fast_preload) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::mb_resolve module + service::build_deps_mb entry point as a parallel-track alternative to fast_preload, structured to match manifest-bench's main-loop shape as closely as correctness allows. Hypothesis under test: fast_preload's eff_parallel caps at ~50/96 because the FastEvent enum match + cache writes + sibling deferred bookkeeping in the main loop competes with tokio runtime workers for the 2 CPU cores on GHA, stalling socket I/O drive. 
mb_fetch pushes ALL per-fetch work into the spawned future itself (including cache writes), so the main loop is reduced to: while let Some(deps) = futs.next().await { pending.extend(deps); refill_to_cap(...); } Sibling specs (multiple ranges on same package) are NOT deferred at queue level — racing fetches for the same name both proceed. The race converges naturally: first fetch to land populates full_manifests, subsequent racers find the cache hit on entry and short-circuit to a sibling-style settle. Wastes ~5-50 network requests in real workloads but eliminates the HashMap probe + drain overhead from the hot loop. Wired in via UTOO_RESOLVE=mb env var: - Context::build_deps (utoo deps) routes through build_deps_mb - pipeline::resolve_with_pipeline (utoo install) also routes through it; pipeline workers still start but don't pipeline during fetch (mb_fetch emits no PackageResolved events) — install becomes phase-sequential, useful for resolve-phase A/B. bench script enables UTOO_RESOLVE=mb so CI measures the new path against existing baselines (utoo-next/utoo-npm/bun ignore the env var). Comment the export line to A/B back against fast_preload. Old fast_preload + UnifiedRegistry paths untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/pm-bench-phases.sh | 7 + crates/pm/src/helper/ruborist_context.rs | 12 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 243 +++++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 161 ++++++++++++++ crates/ruborist/src/service/mod.rs | 2 +- 7 files changed, 440 insertions(+), 3 deletions(-) create mode 100644 crates/ruborist/src/resolver/mb_resolve.rs diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 226ffb751..26e43388c 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,6 +22,13 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" +# Route the current `utoo` binary's resolve phase through the +# experimental `mb_resolve` flat-fetch path. Other PMs ignore this +# env var (utoo-next is built from origin/next which doesn't have +# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out +# to A/B against the default `fast_preload` path. +export UTOO_RESOLVE=mb + # Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN # by the optional "Build next branch utoo" step. Local runs without them diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index bc4d7faa1..542664f8c 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -87,10 +87,20 @@ impl Context { /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes /// `PackageResolved` events here, so preload is pure overhead — BFS's /// own per-level parallel prefetch warms the manifest cache. 
+ /// + /// Set `UTOO_RESOLVE=mb` to opt into the experimental + /// manifest-bench-style fetch path (`build_deps_mb`) for A/B + /// benchmarking against the current `fast_preload`. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let output = utoo_ruborist::service::build_deps(options).await?; + let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 719d31d13..4169ca88d 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,7 +41,22 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - let output = utoo_ruborist::service::build_deps(options).await?; + // `UTOO_RESOLVE=mb` reroutes install through the experimental + // mb-style fetch path. Pipeline workers are still started, but + // because mb_fetch doesn't emit `PackageResolved` events, the + // pipeline only fires once BFS completes (graph_to_package_lock + // emits `PackagePlaced` from BFS). Install becomes + // phase-sequential — fetch all manifests, then download + + // clone. Useful for A/B benchmarking the resolve phase in + // isolation; the pipelining advantage of the default path is + // lost. 
+ let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs new file mode 100644 index 000000000..2928638be --- /dev/null +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -0,0 +1,243 @@ +//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! +//! A parallel-track alternative to [`super::fast_preload`], structured +//! to match `manifest-bench`'s main-loop shape as closely as +//! correctness allows. The hypothesis under test: `fast_preload`'s +//! eff_parallel caps at ~50 against a 96-cap because the main loop's +//! CPU work (FastEvent enum match + cache writes + sibling-deferred +//! bookkeeping + Box::pin allocation) competes with tokio runtime +//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! +//! `mb_resolve` pushes ALL per-fetch work into the spawned future +//! itself (cache writes included) so the main loop is reduced to: +//! +//! ```ignore +//! while let Some(deps) = futs.next().await { +//! pending.extend(deps); +//! refill_to_cap(&mut futs, &mut pending, ...); +//! } +//! ``` +//! +//! Sibling specs (multiple ranges on the same package) are NOT +//! deferred at queue level — if two specs for the same name race, +//! both fetch. This wastes a small number of network requests (~5-50 +//! across a real install) but keeps the main loop's per-event cost +//! minimal (no HashMap probe / drain). The race converges: whichever +//! fetch lands first populates `full_manifests`; subsequent racers +//! 
find the cache hit on entry and short-circuit to a sibling-style +//! settle without re-fetching. +//! +//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` +//! and `utoo install` route through this when set; install loses +//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but +//! gains the lean main loop for resolve-phase A/B testing. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +#[derive(Debug, Default)] +pub struct MbFetchStats { + pub success: usize, + pub fail: usize, +} + +/// Collect dependencies from a deps map, filtering non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut out = Vec::new(); + out.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + out.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + out.extend(collect_deps(manifest.optional_dependencies.as_ref())); + out +} + +/// Settle one (name, spec) against an already-cached `FullManifest`. +/// Used for sibling specs (or racing-fetch losers) — extracts the +/// resolved version's `CoreVersionManifest` on the blocking pool, +/// populates both `(name, spec)` and `(name, resolved_version)` cache +/// slots so BFS hits the early-return fast path. 
+async fn settle_sibling( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name, spec, Arc::clone(&cached)); + return extract_transitive(&cached, peer_deps); + } + + let resolved_for_parse = resolved.clone(); + let full_for_parse = Arc::clone(&full); + let core_opt = tokio::task::spawn_blocking(move || { + full_for_parse + .get_core_version(&resolved_for_parse) + .map(Arc::new) + }) + .await + .ok() + .flatten(); + + let Some(core_arc) = core_opt else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Self-contained per-spec future. Either fetches `(name)`'s full +/// manifest from the registry (if not yet cached), or settles against +/// an already-cached one. In both cases it: +/// * writes `full_manifests` and `version_manifests` cache slots +/// for the resolved spec, +/// * returns the spec's transitive deps for the main loop to +/// enqueue. +/// +/// Racing-fetch handling: two specs for the same name dispatched +/// concurrently both enter the fetch branch (no in-flight gate). The +/// second one re-issues a network round-trip; the cost is bounded by +/// the small number of sibling specs in real workloads (<2% in +/// ant-design-x). Last writer to `cache.set_full_manifest` wins; +/// content is identical so correctness is preserved. +async fn fetch_or_settle( + name: String, + spec: String, + registry_url: String, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + // Sibling fast path: full manifest already cached. 
+ if let Some(full) = cache.get_full_manifest(&name) { + return settle_sibling(name, spec, full, cache, peer_deps).await; + } + + let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &name, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &spec, + ) + .await; + + let Ok(FetchWithSettleResult::Ok(payload)) = result else { + return Vec::new(); + }; + + let full_arc = Arc::new(payload.manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + + let Some((resolved, core_arc)) = payload.primary_settle else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch. See module docs for the +/// rationale. +pub async fn mb_fetch( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> MbFetchStats { + let mut stats = MbFetchStats::default(); + let mut pending: VecDeque = initial_deps.into(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut futs = FuturesUnordered::new(); + let cap = config.concurrency; + let peer_deps = config.peer_deps; + let registry_url = registry_url.to_string(); + + let start = tokio::time::Instant::now(); + + // Initial fill — same shape as the refill below. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + + while let Some(transitive) = futs.next().await { + if transitive.is_empty() { + // Empty result is ambiguous (no transitive deps OR fetch + // failed) — `MbFetchStats` only tracks success/fail at a + // coarse level. 
The fetch-timings counters (recorded + // inside `fetch_full_manifest_with_settle`) carry the + // detailed per-fetch metrics. + stats.fail += 1; + } else { + stats.success += 1; + } + pending.extend(transitive); + + // Refill — same body as the initial fill above. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + } + + let wall = start.elapsed(); + tracing::info!( + "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", + wall.as_millis(), + stats.success, + stats.fail, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index e7baad988..2d0a288d9 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -8,6 +8,7 @@ pub mod fast_preload; pub mod git; #[cfg(feature = "http-tarball")] pub mod http; +pub mod mb_resolve; pub mod preload; pub mod registry; pub mod runtime; diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 3b9b713ea..9687fc875 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -39,6 +39,7 @@ use crate::resolver::builder::{ gather_preload_deps, }; use crate::resolver::fast_preload::fast_preload; +use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; @@ -332,6 +333,166 @@ where }) } +/// Experimental parallel-track entry point: structurally identical to +/// [`build_deps`] but routes the manifest-fetch phase through +/// [`crate::resolver::mb_resolve::mb_fetch`] instead of +/// [`crate::resolver::fast_preload::fast_preload`]. 
+/// +/// Intended for A/B benchmarking: install + lockfile-only callers can +/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). +/// All other behavior — workspace discovery, runtime injection, BFS, +/// graph→lock serialization, project cache export — is the same as +/// `build_deps`. The `EventReceiver` still receives BFS events; it +/// does NOT receive `PreloadFetching` / `PreloadProgress` events +/// because mb_fetch is silent (matches `manifest-bench`'s zero-event +/// loop). +/// +/// **Install-path note:** `pipeline_deps_options` callers that need +/// `PackageResolved` events to drive the download/clone pipeline +/// won't pipeline under this path — mb_fetch finishes all fetches +/// before BFS starts. Use only for `utoo deps`-style workloads, or +/// accept that install becomes phase-sequential. +pub async fn build_deps_mb(options: BuildDepsOptions) -> Result +where + G: Glob + Clone, + R: EventReceiver, +{ + let BuildDepsOptions { + cwd, + registry_url, + cache_dir, + manifest_store, + warm_project_cache, + concurrency, + peer_deps, + glob, + receiver, + supports_semver, + catalogs, + skip_preload: _, + } = options; + + // Steps 1-6: structurally identical to `build_deps` — read + // package.json, inject runtime deps, build initial graph, add + // root edges, discover and add workspaces. 
+ let discovery = WorkspaceDiscovery::new(glob.clone()); + let root_path = discovery.find_root_path(&cwd).await?; + let pkg_path = root_path.join("package.json"); + let mut pkg: PackageJson = super::fs::read_json(&pkg_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; + + if let Some(engines) = &pkg.engines { + let runtime_deps = install_runtime_from_map(engines); + if !runtime_deps.is_empty() { + for (name, version) in runtime_deps { + pkg.optional_dependencies + .get_or_insert_with(HashMap::new) + .entry(name) + .or_insert(version); + } + } + } + + let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); + let root_index = graph.root_index; + let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); + add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); + + let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; + for workspace in workspaces { + let ws_pkg = workspace.package_json; + let workspace_node = + PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let workspace_index = graph.add_node(workspace_node); + let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let link_index = graph.add_node(link_node); + graph.add_physical_edge(root_index, workspace_index); + graph.add_physical_edge(root_index, link_index); + let dep_edge_id = graph.add_dependency_edge( + root_index, + workspace.name.clone(), + &ws_pkg.version, + EdgeType::Prod, + ); + graph.mark_dependency_resolved(dep_edge_id, workspace_index); + add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); + } + + // Step 7-8: cache + registry, same as `build_deps`. Warm project + // cache is honored. 
+ let package_cache = Arc::new(PackageCache::default()); + let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); + + let mut builder = UnifiedRegistry::builder() + .registry(®istry_url) + .cache(package_cache) + .store(Arc::clone(&manifest_store)); + if let Some(semver) = supports_semver { + builder = builder.supports_semver(semver); + } + let registry = builder.build(); + + // Run mb_fetch instead of fast_preload — pre-warms cache by + // walking transitive deps via flat FuturesUnordered. Skipped if + // the warm project cache already covers the workload. + if cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + mb_fetch( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + + // BFS phase reads the now-warm cache. `skip_preload=true` skips + // the receiver-driven preload — mb_fetch already ran. + let mut config = BuildDepsConfig::default() + .with_peer_deps(peer_deps) + .with_concurrency(concurrency) + .with_skip_preload(true) + .with_catalogs(catalogs); + if let Some(dir) = cache_dir { + config = config.with_cache_dir(dir); + } + + build_deps_with_config(&mut graph, ®istry, config, &receiver) + .await + .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + + let t_serialize_start = std::time::Instant::now(); + let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; + + let t_cache_export_start = std::time::Instant::now(); + let mut project_cache = ProjectCacheData::default(); + for (key, manifest) in registry.cache().export_version_manifests() { + let (name, spec) = parse_package_spec(&key); + let version = manifest.version.clone(); + let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); + pkg_cache.specs.insert(spec.to_string(), version.clone()); + 
pkg_cache.manifests.insert(version, (*manifest).clone()); + } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); + + Ok(BuildDepsOutput { + lock: PackageLock::new(&pkg.name, &pkg.version, packages), + project_cache, + }) +} + /// Pre-populate `cache` from a warm project cache. Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 5adb6bf0b..7a7cf8ca8 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 02cc12e7a23214672215a1ee1efd6317e7ce6d8c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 10:39:27 +0800 Subject: [PATCH 20/24] =?UTF-8?q?perf(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20two-phase=20pure=20HTTP=20+=20rayon=20batch=20parse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1/v2 ran parse work in spawn_blocking inside each fetch future, which competed with tokio runtime workers for the 2 GHA cores. CI showed eff_parallel capped at 47/96 vs manifest-bench standalone's 75/96 on the same box. Hypothesis: parse CPU starves socket drive. v3 separates the two phases: * PHASE 1 — `mb_style_pure_fetch` is a structural copy of `manifest-bench`'s main loop: future body does ONLY GET + body recv, refill 1-for-1 on completion. Zero per-future CPU work, so tokio runtime workers retain full CPU for socket drive. 
* PHASE 2 — bulk rayon par_iter parse: for each body, parse `FullManifest` envelope via simd_json::to_borrowed_value, resolve every queued spec for this name against the just-parsed manifest, populate cache slots, collect transitive deps. Runs off the tokio runtime entirely (spawn_blocking → rayon par_iter). Phases alternate until pending exhausted. Typical project: 3-5 iterations as the dep tree fans out wave by wave. The point of the split is the `phase1_http_wall` trace — measured in isolation from any parse work, it should match manifest-bench's standalone wall (~1.5-2.0s for 2733 names @ conc=96). If it does, the remaining gap to mb is concentrated in phase 2 work, which is inherent to discovering transitive deps from a non-flat name list. Tracing per iteration: p1-breakdown mb_fetch iter=N phase1_http_wall=Xms n=Y bytes=Z p1-breakdown mb_fetch iter=N phase2_parse_wall=Xms settles=Y new_transitives=Z p1-breakdown mb_fetch total_wall=Xms iters=Y Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 494 ++++++++++++++------- 1 file changed, 332 insertions(+), 162 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 2928638be..05e1bf038 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,61 +1,87 @@ -//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors +//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + +//! settle. //! -//! A parallel-track alternative to [`super::fast_preload`], structured -//! to match `manifest-bench`'s main-loop shape as closely as -//! correctness allows. The hypothesis under test: `fast_preload`'s -//! eff_parallel caps at ~50 against a 96-cap because the main loop's -//! CPU work (FastEvent enum match + cache writes + sibling-deferred -//! 
bookkeeping + Box::pin allocation) competes with tokio runtime -//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! ## Phase split //! -//! `mb_resolve` pushes ALL per-fetch work into the spawned future -//! itself (cache writes included) so the main loop is reduced to: +//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` +//! ran in `spawn_blocking` threads that competed with tokio runtime +//! workers for CPU on the 2-core GHA box. When 50+ parses ran in +//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` +//! capped at ~47 against the 96 cap (vs `manifest-bench` standalone's +//! 75 on the same box). //! -//! ```ignore -//! while let Some(deps) = futs.next().await { -//! pending.extend(deps); -//! refill_to_cap(&mut futs, &mut pending, ...); -//! } -//! ``` +//! v3 separates the work: //! -//! Sibling specs (multiple ranges on the same package) are NOT -//! deferred at queue level — if two specs for the same name race, -//! both fetch. This wastes a small number of network requests (~5-50 -//! across a real install) but keeps the main loop's per-event cost -//! minimal (no HashMap probe / drain). The race converges: whichever -//! fetch lands first populates `full_manifests`; subsequent racers -//! find the cache hit on entry and short-circuit to a sibling-style -//! settle without re-fetching. +//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of +//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, +//! nothing else) + 1-for-1 refill on completion. The future body +//! has zero CPU work, so the tokio runtime workers retain full CPU +//! to drive sockets and `eff_parallel` reaches the same level as +//! the standalone bench. //! -//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` -//! and `utoo install` route through this when set; install loses -//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but -//! 
gains the lean main loop for resolve-phase A/B testing. +//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For +//! each fetched body: parse `FullManifest` envelope, resolve every +//! spec we need for this name, materialize `CoreVersionManifest` +//! subtrees, populate cache slots, collect transitive deps for the +//! next iteration. +//! +//! Phases alternate until `pending` is empty (typical project: 3-5 +//! iterations as transitive deps fan out wave by wave). +//! +//! Phase 1 is the line we measure against `manifest-bench` — +//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us +//! check eff_parallel directly. +//! +//! Wired in via `UTOO_RESOLVE=mb` env var (see +//! `pm::helper::ruborist_context::Context::build_deps`). -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use bytes::Bytes; use futures::stream::{FuturesUnordered, StreamExt}; +use rayon::prelude::*; +use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; -use crate::service::{ - FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, - fetch_full_manifest_with_settle, -}; +use crate::service::MemoryCache; +use crate::service::http::get_client; use crate::spec::SpecStr; -use crate::util::FETCH_TIMINGS; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, + pub iterations: usize, +} + +/// Phase 1 result: one body per fetched name. `bytes` is `None` on +/// transport / non-2xx — kept in the result vector so phase 2 can +/// account for it, but contributes no settle work. +struct FetchOutcome { + name: String, + bytes: Option, } -/// Collect dependencies from a deps map, filtering non-registry specs. 
-fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +/// Phase 2 per-name output. `full` is `None` on parse failure. +struct ParseOutcome { + name: String, + full: Option>, + /// Per-spec settled subtrees: `(spec, resolved_version, core)`. + /// Empty when the body failed to fetch / parse, or when no spec + /// resolves against the manifest. + settled: Vec<(String, String, Arc)>, + /// Transitive deps collected across all settled subtrees for this + /// name. Already filtered to registry specs; the main loop dedups + /// against `done_names` before queueing. + transitives: Vec, +} + +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -73,99 +99,177 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Settle one (name, spec) against an already-cached `FullManifest`. -/// Used for sibling specs (or racing-fetch losers) — extracts the -/// resolved version's `CoreVersionManifest` on the blocking pool, -/// populates both `(name, spec)` and `(name, resolved_version)` cache -/// slots so BFS hits the early-return fast path. -async fn settle_sibling( - name: String, - spec: String, - full: Arc, - cache: MemoryCache, - peer_deps: PeerDeps, -) -> Vec { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - return Vec::new(); +/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future +/// body does ONLY GET + body recv; no parse, no cache writes, no +/// dedup. Returns one `FetchOutcome` per input name in arrival order. 
+async fn mb_style_pure_fetch( + names: Vec, + registry_url: &str, + concurrency: usize, +) -> Vec { + let client = match get_client() { + Ok(c) => c.clone(), + Err(e) => { + tracing::warn!("get_client failed: {e}"); + return Vec::new(); + } }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name, spec, Arc::clone(&cached)); - return extract_transitive(&cached, peer_deps); - } - let resolved_for_parse = resolved.clone(); - let full_for_parse = Arc::clone(&full); - let core_opt = tokio::task::spawn_blocking(move || { - full_for_parse - .get_core_version(&resolved_for_parse) - .map(Arc::new) - }) - .await - .ok() - .flatten(); + let mut results: Vec = Vec::with_capacity(names.len()); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; - let Some(core_arc) = core_opt else { - return Vec::new(); + let spawn_one = |client: &reqwest::Client, + registry_url: &str, + name: String, + futs: &mut FuturesUnordered<_>| { + let url = format!("{}/{}", registry_url, name); + let client = client.clone(); + futs.push(Box::pin(async move { + let bytes = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), + _ => None, + }; + FetchOutcome { name, bytes } + })); }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + + while idx < names.len() && futs.len() < concurrency { + spawn_one(&client, registry_url, names[idx].clone(), &mut futs); + idx += 1; + } + + while let Some(outcome) = futs.next().await { + results.push(outcome); + if idx < names.len() { + spawn_one(&client, registry_url, names[idx].clone(), &mut futs); + idx += 1; + } + } + + results } -/// Self-contained per-spec future. 
Either fetches `(name)`'s full -/// manifest from the registry (if not yet cached), or settles against -/// an already-cached one. In both cases it: -/// * writes `full_manifests` and `version_manifests` cache slots -/// for the resolved spec, -/// * returns the spec's transitive deps for the main loop to -/// enqueue. -/// -/// Racing-fetch handling: two specs for the same name dispatched -/// concurrently both enter the fetch branch (no in-flight gate). The -/// second one re-issues a network round-trip; the cost is bounded by -/// the small number of sibling specs in real workloads (<2% in -/// ant-design-x). Last writer to `cache.set_full_manifest` wins; -/// content is identical so correctness is preserved. -async fn fetch_or_settle( +/// Sync phase 2 worker: parse one body, settle all specs we need for +/// this name. Runs on rayon (called from `par_iter` in +/// `parse_settle_batch`). +fn parse_one_body( name: String, - spec: String, - registry_url: String, - cache: MemoryCache, + raw: Bytes, + specs: Vec, peer_deps: PeerDeps, -) -> Vec { - // Sibling fast path: full manifest already cached. 
- if let Some(full) = cache.get_full_manifest(&name) { - return settle_sibling(name, spec, full, cache, peer_deps).await; - } +) -> ParseOutcome { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - let result = fetch_full_manifest_with_settle( - FetchManifestOptions { - registry_url: ®istry_url, - name: &name, - format: MetadataFormat::Abbreviated, - etag: None, - }, - &spec, - ) - .await; - - let Ok(FetchWithSettleResult::Ok(payload)) = result else { - return Vec::new(); + let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); + let mut buf = raw.to_vec(); + let parsed = match simd_json::to_borrowed_value(&mut buf) { + Ok(v) => v, + Err(_) => { + return ParseOutcome { + name, + full: None, + settled: Vec::new(), + transitives: Vec::new(), + }; + } }; - let full_arc = Arc::new(payload.manifest); - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + let envelope_name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| name.clone()); + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); - let Some((resolved, core_arc)) = payload.primary_settle else { - return Vec::new(); + let full = FullManifest { + name: envelope_name, + dist_tags, + versions: versions_keys, + raw: Arc::clone(&raw_arc), + ..Default::default() }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + let full_arc = Arc::new(full); + + // For each requested spec, resolve + extract version subtree. + // Cache the per-(name, version) `CoreVersionManifest` so sibling + // specs that resolve to the same version reuse the same Arc. 
+ let mut version_cache: HashMap> = HashMap::new(); + let mut settled = Vec::with_capacity(specs.len()); + let mut transitives = Vec::new(); + + for spec in specs { + let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { + continue; + }; + let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { + Arc::clone(cached) + } else { + let Some(core) = parsed + .get("versions") + .and_then(|v| v.get(resolved_version.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + else { + continue; + }; + let arc = Arc::new(core); + version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); + arc + }; + transitives.extend(extract_transitive(&core_arc, peer_deps)); + settled.push((spec, resolved_version, core_arc)); + } + + ParseOutcome { + name, + full: Some(full_arc), + settled, + transitives, + } +} + +/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. +async fn parse_settle_batch( + bodies: Vec, + by_name: HashMap>, + peer_deps: PeerDeps, +) -> Vec { + let work: Vec<(String, Bytes, Vec)> = bodies + .into_iter() + .filter_map(|f| { + let bytes = f.bytes?; + let specs = by_name.get(&f.name).cloned().unwrap_or_default(); + Some((f.name, bytes, specs)) + }) + .collect(); + + if work.is_empty() { + return Vec::new(); + } + + tokio::task::spawn_blocking(move || { + work.into_par_iter() + .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) + .collect::>() + }) + .await + .unwrap_or_default() } -/// Manifest-bench-style flat parallel fetch. See module docs for the -/// rationale. +/// Two-phase mb-style fetch with rayon batch parse. See module docs. 
pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -173,70 +277,136 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending: VecDeque = initial_deps.into(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - let mut futs = FuturesUnordered::new(); - let cap = config.concurrency; + let mut pending_specs: Vec = initial_deps; + let mut done_names: HashSet = HashSet::new(); + let conc = config.concurrency; let peer_deps = config.peer_deps; - let registry_url = registry_url.to_string(); + let total_start = tokio::time::Instant::now(); - let start = tokio::time::Instant::now(); + while !pending_specs.is_empty() { + stats.iterations += 1; + let iter = stats.iterations; - // Initial fill — same shape as the refill below. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; - }; - if !seen.insert((name.clone(), spec.clone())) { - continue; + // Group this iteration's pending specs by name. + let mut by_name: HashMap> = HashMap::new(); + for (name, spec) in pending_specs.drain(..) { + by_name.entry(name).or_default().push(spec); } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); - } - while let Some(transitive) = futs.next().await { - if transitive.is_empty() { - // Empty result is ambiguous (no transitive deps OR fetch - // failed) — `MbFetchStats` only tracks success/fail at a - // coarse level. The fetch-timings counters (recorded - // inside `fetch_full_manifest_with_settle`) carry the - // detailed per-fetch metrics. - stats.fail += 1; - } else { - stats.success += 1; + // Names whose full manifest is already cached from a prior + // iteration: settle their siblings synchronously (cheap + // semver match + cache lookup; no parse if version_manifest + // already cached, otherwise quick simd_json subtree extract). 
+ let mut sibling_only: Vec<(String, Vec)> = Vec::new(); + let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); + for (name, specs) in &by_name { + if done_names.contains(name) { + sibling_only.push((name.clone(), specs.clone())); + } else { + to_fetch.push(name.clone()); + } } - pending.extend(transitive); - // Refill — same body as the initial fill above. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; + // Sibling settles (rare on real workloads — most names appear + // exactly once across the whole walk). + for (name, specs) in sibling_only { + let Some(full) = cache.get_full_manifest(&name) else { + continue; }; - if !seen.insert((name.clone(), spec.clone())) { + for spec in specs { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + continue; + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); + pending_specs.extend(extract_transitive(&cached, peer_deps)); + continue; + } + if let Some(core) = full.get_core_version(&resolved) { + let core_arc = Arc::new(core); + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); + pending_specs.extend(extract_transitive(&core_arc, peer_deps)); + } + } + } + + if to_fetch.is_empty() { + // Iteration drained pending entirely via sibling settles. + continue; + } + + // PHASE 1 — pure HTTP, mb-style. 
+ let p1_start = tokio::time::Instant::now(); + let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; + let p1_wall = p1_start.elapsed().as_millis(); + let total_bytes: usize = bodies + .iter() + .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) + .sum(); + tracing::info!( + "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", + iter, + p1_wall, + to_fetch.len(), + total_bytes, + ); + + // PHASE 2 — rayon batch parse + settle. + let p2_start = tokio::time::Instant::now(); + let by_name_for_parse = by_name + .iter() + .filter(|(name, _)| !done_names.contains(*name)) + .map(|(n, s)| (n.clone(), s.clone())) + .collect::>(); + let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; + let p2_wall = p2_start.elapsed().as_millis(); + + let mut new_transitives: Vec = Vec::new(); + let mut settle_count = 0usize; + let mut fail_count = 0usize; + for outcome in parsed { + done_names.insert(outcome.name.clone()); + let Some(full_arc) = outcome.full else { + fail_count += 1; continue; + }; + cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); + for (spec, resolved, core) in outcome.settled { + cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); + cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); + settle_count += 1; } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); + new_transitives.extend(outcome.transitives); } + // Names that fetched but failed parse — still mark done so we + // don't refetch them next iteration. 
+ for name in to_fetch { + done_names.insert(name); + } + + stats.success += settle_count; + stats.fail += fail_count; + + tracing::info!( + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + iter, + p2_wall, + settle_count, + fail_count, + new_transitives.len(), + ); + + pending_specs.extend(new_transitives); } - let wall = start.elapsed(); + let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", - wall.as_millis(), + "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + total_wall, + stats.iterations, stats.success, stats.fail, - FETCH_TIMINGS.snapshot().summary_line(), ); stats From 24165fb6d355d78cc606b69773fe2dc466560834 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:09:29 +0800 Subject: [PATCH 21/24] =?UTF-8?q?fix(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20restore=20spec-level=20dedup=20to=20terminate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v3 dropped the (name, spec) HashSet from v1/v2 thinking name-level dedup via done_names was sufficient. It wasn't: sibling-settle's extract_transitive can re-introduce specs we've already settled (peer/optional dep cycles trivially trigger this), so the outer while-loop never terminated. CI 25589397823 hung on `Run phase-isolated benchmark · npmjs` for ~25 min before being cancelled — the bench's first utoo p1_resolve hyperfine run got stuck in an infinite settle loop. Fix: maintain `seen_specs: HashSet<(String, String)>` across all iterations; filter both initial seed and every wave of new transitives through it before extending pending_specs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 42 ++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 05e1bf038..7ef0b5d85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -278,11 +278,20 @@ pub async fn mb_fetch( ) -> MbFetchStats { let mut stats = MbFetchStats::default(); let mut pending_specs: Vec = initial_deps; + // (name, spec) pairs we've already processed (settled or queued + // to settle). Without this, sibling-settle's transitive deps can + // re-introduce already-walked specs and the outer loop never + // terminates — peer / optional dep cycles trivially trigger this. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut done_names: HashSet = HashSet::new(); let conc = config.concurrency; let peer_deps = config.peer_deps; let total_start = tokio::time::Instant::now(); + // Filter the initial seed through `seen_specs` too — root + workspace + // edges can list the same dep multiple times across workspaces. + pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + while !pending_specs.is_empty() { stats.iterations += 1; let iter = stats.iterations; @@ -308,7 +317,8 @@ pub async fn mb_fetch( } // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). + // exactly once across the whole walk). New transitives go + // through `seen_specs` dedup before joining `pending_specs`. 
for (name, specs) in sibling_only { let Some(full) = cache.get_full_manifest(&name) else { continue; @@ -317,17 +327,22 @@ pub async fn mb_fetch( let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { continue; }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - pending_specs.extend(extract_transitive(&cached, peer_deps)); - continue; - } - if let Some(core) = full.get_core_version(&resolved) { + extract_transitive(&cached, peer_deps) + } else if let Some(core) = full.get_core_version(&resolved) { let core_arc = Arc::new(core); cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - pending_specs.extend(extract_transitive(&core_arc, peer_deps)); - } + extract_transitive(&core_arc, peer_deps) + } else { + Vec::new() + }; + pending_specs.extend( + new_deps + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), + ); } } @@ -388,16 +403,21 @@ pub async fn mb_fetch( stats.success += settle_count; stats.fail += fail_count; + let new_unique: Vec = new_transitives + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) + .collect(); + tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", iter, p2_wall, settle_count, fail_count, - new_transitives.len(), + new_unique.len(), ); - pending_specs.extend(new_transitives); + pending_specs.extend(new_unique); } let total_wall = total_start.elapsed().as_millis(); From 41822b081c713758fdbd633513d7257258f39d45 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:51:06 +0800 Subject: [PATCH 22/24] 
=?UTF-8?q?perf(pm):=20preload-bench=20=E2=80=94=20s?= =?UTF-8?q?elf-contained=20streaming=20preload=20baseline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New crate `crates/preload-bench/` is a fully-standalone bench that: * Uses the SAME HTTP setup as `manifest-bench` (own reqwest::Client built per rep with aws-lc-rs TLS, pool_max_idle_per_host(256), no proxy, default DNS, no retry, h1_only). * Discovers names by walking transitive deps from a package.json root — instead of consuming a flat name list like manifest-bench. * Per-future does GET + body recv + spawn_blocking parse → returns transitive deps → main loop refills on completion. * No dependency on ruborist or any utoo internals (own simd_json, own dedup, own everything). The point: prove (or disprove) that a fully ruborist-independent streaming preload can hit standalone manifest-bench's wall on the same workload. ruborist's path runs at ~2.18s for ant-design's ~2700 names; manifest-bench standalone runs the same workload at ~1.6s. The gap could be in any number of things — DNS layer, retry, pool config, parse-CPU contention, registry single-flight gates. preload-bench eliminates all of those simultaneously so we can read the wall directly. Wired into bench-phases-linux: builds + uploads preload-bench binary alongside manifest-bench, then runs a conc=64/96/128 sweep against the same project after the standalone manifest-bench sweep. bench script reverts UTOO_RESOLVE=mb so utoo runs default fast_preload — gives a third datapoint (utoo wall on integrated path) alongside manifest-bench (HTTP-only ceiling) and preload-bench (streaming-with-walk ceiling). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 69 ++++ Cargo.toml | 1 + bench/pm-bench-phases.sh | 12 +- crates/preload-bench/Cargo.toml | 38 +++ crates/preload-bench/src/main.rs | 505 +++++++++++++++++++++++++++++ 5 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 crates/preload-bench/Cargo.toml create mode 100644 crates/preload-bench/src/main.rs diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index b25f5c380..eb560969b 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -161,6 +161,25 @@ jobs: name: manifest-bench-linux-x64 path: target/x86_64-unknown-linux-gnu/release/manifest-bench retention-days: 1 + # preload-bench: same HTTP setup as manifest-bench, but discovers + # names by walking transitive deps from a package.json root — + # tests whether a fully self-contained streaming preload can match + # standalone manifest-bench's wall on the same workload that + # ruborist's path runs at ~2.18s. 
+ - name: Build preload-bench + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p preload-bench + - name: Upload preload-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: preload-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/preload-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -547,6 +566,20 @@ jobs: chmod +x /tmp/manifest-bench-dist/manifest-bench mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + # Self-contained streaming preload bench — same HTTP setup as + # manifest-bench but discovers names via transitive walk from a + # package.json. Used to test whether a fully-isolated path can + # match standalone manifest-bench's wall on the same workload. 
+ - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -645,6 +678,42 @@ jobs: "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true } 2>&1 | tee "$MB_LOG" + # Self-contained streaming preload (transitive walk from + # package.json) — same HTTP setup as manifest-bench but with a + # streaming FuturesUnordered + per-future parse. This tests + # whether a fully ruborist-independent path can hit standalone + # manifest-bench's wall under the same project workload. + - name: Standalone preload-bench (transitive walk sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + echo "no project dir; skipping preload-bench"; exit 0 + fi + PJ="$PROJECT_DIR/package.json" + if [ ! -f "$PJ" ]; then + echo "no package.json; skipping preload-bench"; exit 0 + fi + + PB_LOG=/tmp/pm-bench-output/preload-bench-npmjs.log + { + echo "============================================================" + echo "preload-bench: streaming transitive-walk preload" + echo " Self-contained (no ruborist deps). Same HTTP setup as" + echo " manifest-bench, but discovers names by walking transitive" + echo " deps from package.json instead of consuming a flat list." 
+ echo "============================================================" + for CAP in 64 96 128; do + echo + echo "--- concurrency=$CAP, h1, transitive walk ---" + "$PRELOAD_BENCH_BIN" --package-json "$PJ" --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 4 || true + done + } 2>&1 | tee "$PB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 diff --git a/Cargo.toml b/Cargo.toml index 0574a185a..4b2836c06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "crates/manifest-bench", + "crates/preload-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 26e43388c..b025ebc6f 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,12 +22,12 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" -# Route the current `utoo` binary's resolve phase through the -# experimental `mb_resolve` flat-fetch path. Other PMs ignore this -# env var (utoo-next is built from origin/next which doesn't have -# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out -# to A/B against the default `fast_preload` path. -export UTOO_RESOLVE=mb +# utoo path defaults to fast_preload (combined-parse) so we have a +# stable baseline to compare against. preload-bench is run as a +# separate standalone tool by the CI workflow — its wall is the +# self-contained-streaming reference, ruborist's utoo p1_resolve +# wall is the integrated path. The gap between them is what +# remains to close. 
# Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN diff --git a/crates/preload-bench/Cargo.toml b/crates/preload-bench/Cargo.toml new file mode 100644 index 000000000..9d37d7769 --- /dev/null +++ b/crates/preload-bench/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "preload-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Self-contained streaming-with-transitive-walk manifest preload bench. Reproduces manifest-bench's standalone fetch loop but discovers transitive deps from package.json instead of consuming a flat name list. No dependency on ruborist or any utoo internals." + +[[bin]] +name = "preload-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +simd-json = "0.17" +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Same TLS/DNS choices as manifest-bench so the only delta vs that bench +# is the transitive-walk loop. +reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/preload-bench/src/main.rs b/crates/preload-bench/src/main.rs new file mode 100644 index 000000000..46f917d19 --- /dev/null +++ b/crates/preload-bench/src/main.rs @@ -0,0 +1,505 @@ +//! Self-contained streaming preload bench with transitive walking. +//! +//! Same HTTP setup as `manifest-bench` (own `reqwest::Client` built +//! per rep with `aws-lc-rs` TLS, `pool_max_idle_per_host(256)`, no +//! proxy, default DNS, no retry). 
The only delta vs `manifest-bench` +//! is that this bench discovers names by walking transitive deps +//! from a `package.json` root, instead of consuming a flat name +//! list. +//! +//! Why a separate crate: ruborist's manifest-fetch path goes through +//! several service layers (custom DNS resolver, retry, cache, +//! single-flight gates, event receivers). Each layer might add +//! overhead. This bench bypasses all of them — same shape as +//! manifest-bench, just with a streaming `FuturesUnordered` that +//! refills from a pending queue extended by parsed transitive deps. +//! +//! Reports both the standalone preload wall and a per-rep eff_parallel +//! number so we can compare directly against manifest-bench's +//! `phase_wall` + `avg_conc` for the same workload. +//! +//! Output (one line per rep, matching manifest-bench shape): +//! [rep N] preload_wall=Xms n=Y bytes=Z avg_conc=N.N parse_sum=Wms 200=A 4xx=B err=C + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; +use serde::Deserialize; + +#[derive(Parser, Debug)] +#[command( + name = "preload-bench", + about = "Streaming preload bench with transitive walking (self-contained)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// Path to a `package.json` to walk from. Reads `dependencies` + + /// `devDependencies` + `optionalDependencies` as the initial seed. + #[arg(long)] + package_json: PathBuf, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 96)] + concurrency: usize, + + /// Number of times to repeat the whole walk (fresh client per rep). + #[arg(long, default_value_t = 4)] + reps: usize, + + /// Force HTTP/1.1. + #[arg(long, default_value_t = true)] + http1_only: bool, + + /// Override `User-Agent`. 
+ #[arg(long)] + user_agent: Option, + + /// Include `peerDependencies` when walking transitives. Off by + /// default (matches utoo's default). + #[arg(long)] + include_peer: bool, +} + +#[derive(Deserialize)] +struct PackageJson { + #[serde(default)] + dependencies: HashMap, + #[serde(default, rename = "devDependencies")] + dev_dependencies: HashMap, + #[serde(default, rename = "optionalDependencies")] + optional_dependencies: HashMap, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let raw = std::fs::read_to_string(&args.package_json) + .with_context(|| format!("read {:?}", args.package_json))?; + let pkg: PackageJson = serde_json::from_str(&raw).context("parse package.json")?; + let initial: Vec<(String, String)> = pkg + .dependencies + .into_iter() + .chain(pkg.dev_dependencies) + .chain(pkg.optional_dependencies) + .filter(|(_, spec)| is_registry_spec(spec)) + .collect(); + + println!( + "preload-bench: registry={} concurrency={} reps={} initial={} h1_only={} ua={} include_peer={}", + args.registry, + args.concurrency, + args.reps, + initial.len(), + args.http1_only, + args.user_agent.as_deref().unwrap_or(""), + args.include_peer, + ); + + for rep in 1..=args.reps { + run_once(&args, &initial, rep).await?; + } + + Ok(()) +} + +/// Quick registry-spec check (a `^...` / `~...` / `latest` / etc). +/// Excludes `file:`, `link:`, `workspace:`, `git+`, `https://`, and +/// `/` shorthand. Same intent as ruborist's +/// `SpecStr::is_registry_spec` but inlined to keep this crate +/// dependency-free. 
+fn is_registry_spec(spec: &str) -> bool { + if spec.is_empty() { + return true; // bare entries default to "*" + } + let lower = spec.to_ascii_lowercase(); + if lower.starts_with("file:") + || lower.starts_with("link:") + || lower.starts_with("workspace:") + || lower.starts_with("portal:") + || lower.starts_with("git+") + || lower.starts_with("git://") + || lower.starts_with("github:") + || lower.starts_with("https://") + || lower.starts_with("http://") + { + return false; + } + // `/` shorthand — exactly one '/' and no '@' prefix on + // first segment (rules out scoped names like `@scope/pkg`). + if let Some((head, tail)) = spec.split_once('/') + && !head.starts_with('@') + && !tail.is_empty() + && !tail.contains('/') + { + return false; + } + true +} + +#[derive(Debug, Default)] +struct RepStats { + n: usize, + bytes: usize, + parse_sum_us: u128, + busy_us: u128, + sum_us: u128, + ok_200: usize, + err_4xx: usize, + err_other: usize, +} + +async fn run_once(args: &Args, initial: &[(String, String)], rep: usize) -> Result<()> { + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let concurrency = args.concurrency; + let include_peer = args.include_peer; + + let phase_start = Instant::now(); + let mut stats = RepStats::default(); + + // (name, spec) dedup — same shape as ruborist's seen_specs but + // self-contained. We dedup the *spec* level because two specs on + // the same name might resolve to different versions. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque<(String, String)> = VecDeque::new(); + for (name, spec) in initial { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name.clone(), spec.clone())); + } + } + + // Sibling-fetch dedup: when two specs for the same name are both + // pending, only one fetch is issued; subsequent specs settle from + // the cached body. Keyed by name. 
Maps name → cached parsed body + // (`Arc>`) once the first fetch lands. + let body_cache: Arc>>>> = + Arc::new(std::sync::Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + + // If the body is already cached (sibling spec for an + // already-fetched name), spawn a settle-only future. + if let Some(raw) = body_cache.lock().unwrap().get(&name).cloned() { + let n = name.clone(); + let s = spec.clone(); + let fut: Fut = Box::pin(settle_only(n, s, raw, include_peer)); + futs.push(fut); + continue; + } + + // First time seeing this name: fetch + settle. Stash any + // sibling specs that arrive while in-flight. + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + + spawn_fetch( + &client, + ®istry, + name, + spec, + Arc::clone(&body_cache), + include_peer, + &mut futs, + ); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + stats.n += 1; + stats.busy_us += out.busy_us; + stats.sum_us += out.sum_us; + stats.parse_sum_us += out.parse_us; + stats.bytes += out.bytes; + match out.status { + 200 => stats.ok_200 += 1, + 400..=499 => stats.err_4xx += 1, + _ => stats.err_other += 1, + } + + // Drain sibling specs for this name now that body is cached. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().unwrap().get(&out.name).cloned() + { + for sibling_spec in siblings { + let n = out.name.clone(); + let r = Arc::clone(&raw); + let fut: Fut = Box::pin(settle_only(n, sibling_spec, r, include_peer)); + futs.push(fut); + } + } + + // Extend pending with new transitives, dedup by (name, spec). 
+ for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } + } + + let phase_wall_ms = phase_start.elapsed().as_millis(); + let parse_sum_ms = stats.parse_sum_us / 1000; + // avg_conc = sum_request_us / busy_window_us. busy_us isn't a true + // merged-interval here (we don't track per-req start/end timestamps + // for that), so use phase_wall as the denominator — slightly + // pessimistic but consistent. + let avg_conc = if phase_wall_ms > 0 { + stats.sum_us as f64 / 1000.0 / phase_wall_ms as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] preload_wall={phase_wall_ms}ms n={} bytes={} parse_sum={parse_sum_ms}ms avg_conc={avg_conc:.1} 200={} 4xx={} err={}", + stats.n, stats.bytes, stats.ok_200, stats.err_4xx, stats.err_other, + ); + Ok(()) +} + +#[derive(Debug)] +struct FetchOutcome { + name: String, + /// `(name, spec)` transitive deps unfolded by parsing the resolved + /// version's `dependencies` / `optionalDependencies` (and + /// optionally `peerDependencies`). + transitives: Vec<(String, String)>, + /// `true` if this future fetched the body (vs settle-only on a + /// cached body); only fetchers populate `body_cache` and trigger + /// sibling drain. + fetched: bool, + /// HTTP status code (200 / 4xx / 5xx / 0 on transport error). + status: u16, + /// Body byte count (0 on error). + bytes: usize, + /// Self-reported per-future busy_us — `end - start`. Approximate. + busy_us: u128, + /// Sum of all per-future durations summed by the main loop. + sum_us: u128, + /// Parse work done inside this future (for accounting). 
+ parse_us: u128, +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_fetch( + client: &reqwest::Client, + registry: &Arc, + name: String, + spec: String, + body_cache: Arc>>>>, + include_peer: bool, + futs: &mut FuturesUnordered, +) { + let url = format!("{}/{}", registry, name); + let client = client.clone(); + let fut: Fut = Box::pin(async move { + let start = Instant::now(); + let req = client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send(); + let (raw_bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default(); + (body, status) + } + Err(_) => (Vec::new(), 0), + }; + let bytes = raw_bytes.len(); + + let (parse_us, transitives) = if status == 200 && !raw_bytes.is_empty() { + let raw_arc = Arc::new(raw_bytes); + body_cache + .lock() + .unwrap() + .insert(name.clone(), Arc::clone(&raw_arc)); + // Move the Arc> into spawn_blocking; the parser + // mutates a clone, so the cached copy is unaffected. 
+ let spec_for_parse = spec.clone(); + let parse_start = Instant::now(); + let result = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw_arc, &spec_for_parse, include_peer) + }) + .await + .ok() + .flatten() + .unwrap_or_default(); + (parse_start.elapsed().as_micros(), result) + } else { + (0, Vec::new()) + }; + + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: true, + status, + bytes, + busy_us, + sum_us: busy_us, + parse_us, + } + }); + futs.push(fut); +} + +async fn settle_only( + name: String, + spec: String, + raw: Arc>, + include_peer: bool, +) -> FetchOutcome { + let start = Instant::now(); + let parse_start = start; + let transitives = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw, &spec, include_peer).unwrap_or_default() + }) + .await + .unwrap_or_default(); + let parse_us = parse_start.elapsed().as_micros(); + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: false, + status: 200, + bytes: 0, + busy_us, + sum_us: busy_us, + parse_us, + } +} + +/// Parse a manifest body, resolve `spec` against the version list, +/// extract that version's transitive deps. Single +/// `simd_json::to_borrowed_value` pass for the whole body — same as +/// ruborist's combined-parse path, but inlined here so this crate +/// has no ruborist dependency. +fn parse_and_extract( + raw: &Arc>, + spec: &str, + include_peer: bool, +) -> Option> { + use simd_json::prelude::{ValueAsObject, ValueObjectAccess}; + + let mut buf = (**raw).clone(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_obj = parsed.get("versions").and_then(ValueAsObject::as_object)?; + + // Resolve spec. 
Three cases: dist-tag match, exact-version key, or + // semver range (we approximate with "first version that satisfies" + // — preload-bench is a measurement tool, not a real resolver, so + // we tolerate slight selection differences vs ruborist for the + // purpose of timing the network path). + let resolved = if let Some(via_tag) = dist_tags.get(spec) { + via_tag.clone() + } else if versions_obj.contains_key(spec) { + spec.to_string() + } else if let Some(latest) = dist_tags.get("latest") + && spec_satisfied_by(spec, latest) + { + latest.clone() + } else { + // Last-resort: pick the lexicographically-largest version. Not + // semver-correct but bounded by the version set, and good + // enough for timing. + versions_obj.keys().max().map(|k| k.to_string())? + }; + + let version_obj = versions_obj.get(resolved.as_str())?; + let mut out: Vec<(String, String)> = Vec::new(); + + if let Some(deps) = version_obj.get("dependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if include_peer + && let Some(deps) = version_obj.get("peerDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if let Some(deps) = version_obj.get("optionalDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + Some(out) +} + +/// Crude semver-satisfies check: only handles `^X.Y.Z` and `~X.Y.Z` +/// against an exact target. Sufficient for "does latest satisfy spec" +/// in this measurement context — full semver is in the resolver, not +/// the bench. 
+fn spec_satisfied_by(spec: &str, target: &str) -> bool { + let s = spec.trim(); + let body = s + .strip_prefix('^') + .or_else(|| s.strip_prefix('~')) + .unwrap_or(s); + target.starts_with(body) || target == body +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Same setup as manifest-bench. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} From 01d15130d01cb6768d2fe5b4d4c577a7b4139a03 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:55:56 +0800 Subject: [PATCH 23/24] perf(pm): integrate standalone preload into ruborist for lockfile-only path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 1 of staged service-layer ablation. Rewrites mb_resolve as a fully self-contained streaming preload mirroring preload-bench's loop shape verbatim, but living inside ruborist so it can populate MemoryCache for the BFS phase. 
Bypasses every other ruborist service layer: * service::http::get_client — own reqwest::Client built per call, no global LazyLock, no shared_resolver dns layer, no connect_timeout, pool_max_idle_per_host(256). * service::manifest::fetch_full_manifest_with_settle — own GET + body.bytes() + spawn_blocking(simd_json::to_borrowed_value), no RetryIf, no FETCH_TIMINGS. * service::registry::UnifiedRegistry — no OnceMap, no ManifestStore, no EventReceiver. Only service::* touched is MemoryCache writes (DashMap inserts) so BFS has data to read from. PM is unaware: dispatch happens entirely inside service::api::build_deps when skip_preload=true and no warm cache. Removes the previous UTOO_RESOLVE=mb env-var gating from pm::helper::ruborist_context::Context::build_deps and pipeline::resolve_with_pipeline. Removes the now-unused service::api::build_deps_mb sibling entry point. Expected: utoo p1_resolve drops from ~2.67s toward preload-bench's ~2.57s (or better since ruborist fetches fewer names than preload-bench). The remaining gap to mb's ~1.99s would isolate incremental layer effects we add back next: - tokio runtime config / cooperative scheduling - reqwest::Client provider differences (TLS, DNS) - cache layer (DashMap vs DiskManifestStore reads on the cold path) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 597 ++++++++++----------- crates/ruborist/src/service/api.rs | 175 +----- crates/ruborist/src/service/mod.rs | 2 +- 5 files changed, 289 insertions(+), 524 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index 542664f8c..c8b758a6f 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -84,23 +84,17 @@ impl Context { /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. 
/// - /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes - /// `PackageResolved` events here, so preload is pure overhead — BFS's - /// own per-level parallel prefetch warms the manifest cache. - /// - /// Set `UTOO_RESOLVE=mb` to opt into the experimental - /// manifest-bench-style fetch path (`build_deps_mb`) for A/B - /// benchmarking against the current `fast_preload`. + /// Used by the lockfile-only path (`utoo deps`). With + /// `skip_preload=true`, ruborist's `service::api::build_deps` + /// internally routes through `mb_resolve::mb_fetch` — a + /// standalone manifest-bench-style preload that bypasses + /// `service::http` / `service::manifest` / `service::registry` + /// for the cold-cache lockfile-only workload. PM doesn't see + /// the dispatch. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 4169ca88d..719d31d13 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,22 +41,7 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - // `UTOO_RESOLVE=mb` reroutes install through the experimental - // mb-style fetch path. 
Pipeline workers are still started, but - // because mb_fetch doesn't emit `PackageResolved` events, the - // pipeline only fires once BFS completes (graph_to_package_lock - // emits `PackagePlaced` from BFS). Install becomes - // phase-sequential — fetch all manifests, then download + - // clone. Useful for A/B benchmarking the resolve phase in - // isolation; the pipelining advantage of the default path is - // lost. - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7ef0b5d85..7e1376330 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,47 +1,42 @@ -//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors -//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + -//! settle. +//! Standalone manifest preload for the lockfile-only path. //! -//! ## Phase split +//! Mirrors `crates/preload-bench`'s loop shape verbatim, but lives +//! inside ruborist so it can populate `MemoryCache` for the BFS phase +//! to read. Used by `service::api::build_deps` whenever the caller +//! has `skip_preload=true` and no warm project cache — i.e. the +//! `utoo deps` (lockfile-only) path. //! -//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` -//! ran in `spawn_blocking` threads that competed with tokio runtime -//! workers for CPU on the 2-core GHA box. When 50+ parses ran in -//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` -//! 
capped at ~47 against the 96 cap (vs `manifest-bench` standalone's -//! 75 on the same box). +//! Bypasses every other ruborist service layer: +//! * `service::http::get_client` — own `reqwest::Client` built per +//! call, no global LazyLock, no `dns_resolver(shared_resolver)`, +//! no `connect_timeout`, `pool_max_idle_per_host(256)` matching +//! `preload-bench` / `manifest-bench`. +//! * `service::manifest::fetch_full_manifest_with_settle` — own +//! `reqwest::get + body.bytes() + spawn_blocking(simd_json +//! to_borrowed_value)`, no `RetryIf`, no `FETCH_TIMINGS`. +//! * `service::registry::UnifiedRegistry` — no `OnceMap` inflight +//! gates, no `ManifestStore`, no `EventReceiver`. //! -//! v3 separates the work: +//! The only `service::*` touched is `MemoryCache::set_full_manifest` +//! and `MemoryCache::set_version_manifest` — thin DashMap wrappers +//! the BFS phase reads from. Without that, BFS would have nothing to +//! resolve against. //! -//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of -//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, -//! nothing else) + 1-for-1 refill on completion. The future body -//! has zero CPU work, so the tokio runtime workers retain full CPU -//! to drive sockets and `eff_parallel` reaches the same level as -//! the standalone bench. -//! -//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For -//! each fetched body: parse `FullManifest` envelope, resolve every -//! spec we need for this name, materialize `CoreVersionManifest` -//! subtrees, populate cache slots, collect transitive deps for the -//! next iteration. -//! -//! Phases alternate until `pending` is empty (typical project: 3-5 -//! iterations as transitive deps fan out wave by wave). -//! -//! Phase 1 is the line we measure against `manifest-bench` — -//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us -//! check eff_parallel directly. -//! -//! Wired in via `UTOO_RESOLVE=mb` env var (see -//! 
`pm::helper::ruborist_context::Context::build_deps`). - -use std::collections::{HashMap, HashSet}; +//! Why a separate path: same-run CI data shows `preload-bench` +//! (self-contained, transitive walk, 4153 fetches) lands at ~2.57s +//! while ruborist's existing `fast_preload` path (combined parse via +//! service layers, 2733 fetches) lands at ~2.67s on the same network +//! — so on a per-fetch basis the service-layer path is ~50 % slower. +//! Removing the layers should close that gap. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use std::time::Instant; -use bytes::Bytes; +use anyhow::{Context, Result}; use futures::stream::{FuturesUnordered, StreamExt}; -use rayon::prelude::*; +use parking_lot::Mutex; use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -49,38 +44,29 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; -use crate::service::http::get_client; use crate::spec::SpecStr; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, - pub iterations: usize, -} - -/// Phase 1 result: one body per fetched name. `bytes` is `None` on -/// transport / non-2xx — kept in the result vector so phase 2 can -/// account for it, but contributes no settle work. -struct FetchOutcome { - name: String, - bytes: Option, } -/// Phase 2 per-name output. `full` is `None` on parse failure. -struct ParseOutcome { - name: String, - full: Option>, - /// Per-spec settled subtrees: `(spec, resolved_version, core)`. - /// Empty when the body failed to fetch / parse, or when no spec - /// resolves against the manifest. - settled: Vec<(String, String, Arc)>, - /// Transitive deps collected across all settled subtrees for this - /// name. Already filtered to registry specs; the main loop dedups - /// against `done_names` before queueing. 
- transitives: Vec, +/// Build a fresh `reqwest::Client` matching `preload-bench` / +/// `manifest-bench` exactly, except for the TLS provider — those +/// benches use aws-lc-rs but we keep ruborist's existing default +/// rustls (ring on Linux). If A/B data shows TLS is the remaining +/// gap, we'll add the aws-lc-rs deps separately. +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .pool_max_idle_per_host(256) + .http1_only() + .build() + .context("build reqwest client for mb_resolve") } +/// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() @@ -99,177 +85,183 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future -/// body does ONLY GET + body recv; no parse, no cache writes, no -/// dedup. Returns one `FetchOutcome` per input name in arrival order. -async fn mb_style_pure_fetch( - names: Vec, - registry_url: &str, - concurrency: usize, -) -> Vec { - let client = match get_client() { - Ok(c) => c.clone(), - Err(e) => { - tracing::warn!("get_client failed: {e}"); - return Vec::new(); - } - }; - - let mut results: Vec = Vec::with_capacity(names.len()); - let mut futs = FuturesUnordered::new(); - let mut idx = 0usize; - - let spawn_one = |client: &reqwest::Client, - registry_url: &str, - name: String, - futs: &mut FuturesUnordered<_>| { - let url = format!("{}/{}", registry_url, name); - let client = client.clone(); - futs.push(Box::pin(async move { - let bytes = match client - .get(&url) - .header("accept", "application/vnd.npm.install-v1+json") - .send() - .await - { - Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), - _ => None, - }; - FetchOutcome { name, bytes } - })); - }; +/// What a future returns when it lands. 
The main loop uses +/// `transitives` to extend `pending`, plus the cache writes already +/// happened inside the future. Only `fetched=true` futures populate +/// `body_cache` and trigger sibling drain. +struct FetchOutcome { + name: String, + transitives: Vec, + fetched: bool, +} - while idx < names.len() && futs.len() < concurrency { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } +type Fut = Pin + Send>>; - while let Some(outcome) = futs.next().await { - results.push(outcome); - if idx < names.len() { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } - } +/// `(name, spec) → (FullManifest, resolved_version, version_subtree, transitive_deps)`. +type ParseResult = ( + Arc, + String, + Arc, + Vec, +); - results -} +/// Single combined parse: one `simd_json::to_borrowed_value` over the +/// raw body extracts the envelope (name, dist-tags, versions keys) +/// AND deserializes the resolved version's `CoreVersionManifest` +/// subtree. Same shape as the parse step in `preload-bench`. +fn parse_combined(raw: Arc<[u8]>, spec: &str, peer_deps: PeerDeps) -> Option { + use simd_json::prelude::{ValueAsObject, ValueAsScalar, ValueObjectAccess}; -/// Sync phase 2 worker: parse one body, settle all specs we need for -/// this name. Runs on rayon (called from `par_iter` in -/// `parse_settle_batch`). 
-fn parse_one_body( - name: String, - raw: Bytes, - specs: Vec, - peer_deps: PeerDeps, -) -> ParseOutcome { - use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - - let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); - let mut buf = raw.to_vec(); - let parsed = match simd_json::to_borrowed_value(&mut buf) { - Ok(v) => v, - Err(_) => { - return ParseOutcome { - name, - full: None, - settled: Vec::new(), - transitives: Vec::new(), - }; - } - }; + let mut buf = (*raw).to_vec(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; - let envelope_name = parsed + let name = parsed .get("name") .and_then(|v| v.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| name.clone()); + .unwrap_or_default(); let dist_tags: HashMap = parsed .get("dist-tags") .and_then(|v| HashMap::::deserialize(v).ok()) .unwrap_or_default(); let versions_keys: Vec = parsed .get("versions") - .and_then(simd_json::prelude::ValueAsObject::as_object) + .and_then(ValueAsObject::as_object) .map(|obj| obj.keys().map(|k| k.to_string()).collect()) .unwrap_or_default(); let full = FullManifest { - name: envelope_name, + name, dist_tags, versions: versions_keys, - raw: Arc::clone(&raw_arc), + raw: Arc::clone(&raw), ..Default::default() }; - let full_arc = Arc::new(full); - - // For each requested spec, resolve + extract version subtree. - // Cache the per-(name, version) `CoreVersionManifest` so sibling - // specs that resolve to the same version reuse the same Arc. 
- let mut version_cache: HashMap> = HashMap::new(); - let mut settled = Vec::with_capacity(specs.len()); - let mut transitives = Vec::new(); - - for spec in specs { - let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { - continue; + + let resolved = resolve_target_version((&full).into(), spec).ok()?; + let core = parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())?; + let core_arc = Arc::new(core); + let transitives = extract_transitive(&core_arc, peer_deps); + + Some((Arc::new(full), resolved, core_arc, transitives)) +} + +/// Fetch + combined parse + cache write for one `(name, spec)`. +/// Future body owns all per-fetch work; main loop only extends +/// `pending` from the returned transitives and refills `futs`. +fn spawn_fetch( + client: reqwest::Client, + registry_url: Arc, + name: String, + spec: String, + cache: MemoryCache, + body_cache: Arc>>>, + peer_deps: PeerDeps, +) -> Fut { + Box::pin(async move { + let url = format!("{}/{}", registry_url, name); + let resp = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(r) if r.status().is_success() => r, + _ => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } }; - let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { - Arc::clone(cached) - } else { - let Some(core) = parsed - .get("versions") - .and_then(|v| v.get(resolved_version.as_str())) - .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) - else { - continue; - }; - let arc = Arc::new(core); - version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); - arc + let raw_bytes = match resp.bytes().await { + Ok(b) => b, + Err(_) => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } + }; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); + // 
Stash in body_cache early so concurrent sibling specs + // arriving slightly after see it on their pending pop. + body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); + + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = + tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), }; - transitives.extend(extract_transitive(&core_arc, peer_deps)); - settled.push((spec, resolved_version, core_arc)); - } - ParseOutcome { - name, - full: Some(full_arc), - settled, - transitives, - } + FetchOutcome { + name, + transitives, + fetched: true, + } + }) } -/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. -async fn parse_settle_batch( - bodies: Vec, - by_name: HashMap>, +/// Settle-only future for a sibling spec whose `(name)` body already +/// landed via a sibling fetch. Same combined parse, no network. 
+fn spawn_settle( + name: String, + spec: String, + raw: Arc<[u8]>, + cache: MemoryCache, peer_deps: PeerDeps, -) -> Vec { - let work: Vec<(String, Bytes, Vec)> = bodies - .into_iter() - .filter_map(|f| { - let bytes = f.bytes?; - let specs = by_name.get(&f.name).cloned().unwrap_or_default(); - Some((f.name, bytes, specs)) +) -> Fut { + Box::pin(async move { + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(Arc::clone(&raw), &spec_for_parse, peer) }) - .collect(); - - if work.is_empty() { - return Vec::new(); - } + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + // Don't overwrite full_manifest — the original fetcher + // already set it. Only populate the version-manifest + // slots so BFS hits the (name, spec) early-return. + cache.set_full_manifest(name.clone(), full_arc); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), + }; - tokio::task::spawn_blocking(move || { - work.into_par_iter() - .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) - .collect::>() + FetchOutcome { + name, + transitives, + fetched: false, + } }) - .await - .unwrap_or_default() } -/// Two-phase mb-style fetch with rayon batch parse. See module docs. +/// Streaming preload with transitive walk. Self-contained — no +/// dependency on `service::http` / `service::manifest` / +/// `service::registry` beyond `MemoryCache` writes. pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -277,154 +269,109 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending_specs: Vec = initial_deps; - // (name, spec) pairs we've already processed (settled or queued - // to settle). 
Without this, sibling-settle's transitive deps can - // re-introduce already-walked specs and the outer loop never - // terminates — peer / optional dep cycles trivially trigger this. - let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - let mut done_names: HashSet = HashSet::new(); - let conc = config.concurrency; + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return stats; + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = config.concurrency; let peer_deps = config.peer_deps; - let total_start = tokio::time::Instant::now(); - // Filter the initial seed through `seen_specs` too — root + workspace - // edges can list the same dep multiple times across workspaces. - pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + // Spec-level dedup across the entire run. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + for (name, spec) in initial_deps { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } - while !pending_specs.is_empty() { - stats.iterations += 1; - let iter = stats.iterations; + // Sibling-fetch dedup: when two specs for the same name are both + // in flight, only the first fires a fetch; the second arrives at + // the cached body and goes through `spawn_settle` instead. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); - // Group this iteration's pending specs by name. - let mut by_name: HashMap> = HashMap::new(); - for (name, spec) in pending_specs.drain(..) 
{ - by_name.entry(name).or_default().push(spec); - } + let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Names whose full manifest is already cached from a prior - // iteration: settle their siblings synchronously (cheap - // semver match + cache lookup; no parse if version_manifest - // already cached, otherwise quick simd_json subtree extract). - let mut sibling_only: Vec<(String, Vec)> = Vec::new(); - let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); - for (name, specs) in &by_name { - if done_names.contains(name) { - sibling_only.push((name.clone(), specs.clone())); - } else { - to_fetch.push(name.clone()); + loop { + // Refill to cap. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + // Sibling fast path: body already cached. + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; } - } - - // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). New transitives go - // through `seen_specs` dedup before joining `pending_specs`. - for (name, specs) in sibling_only { - let Some(full) = cache.get_full_manifest(&name) else { + // Defer if a fetch for this name is already in flight. 
+ if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); continue; - }; - for spec in specs { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - continue; - }; - let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - extract_transitive(&cached, peer_deps) - } else if let Some(core) = full.get_core_version(&resolved) { - let core_arc = Arc::new(core); - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) - } else { - Vec::new() - }; - pending_specs.extend( - new_deps - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), - ); } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); } - if to_fetch.is_empty() { - // Iteration drained pending entirely via sibling settles. - continue; + if futs.is_empty() { + break; } - // PHASE 1 — pure HTTP, mb-style. - let p1_start = tokio::time::Instant::now(); - let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; - let p1_wall = p1_start.elapsed().as_millis(); - let total_bytes: usize = bodies - .iter() - .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) - .sum(); - tracing::info!( - "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", - iter, - p1_wall, - to_fetch.len(), - total_bytes, - ); - - // PHASE 2 — rayon batch parse + settle. 
- let p2_start = tokio::time::Instant::now(); - let by_name_for_parse = by_name - .iter() - .filter(|(name, _)| !done_names.contains(*name)) - .map(|(n, s)| (n.clone(), s.clone())) - .collect::>(); - let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; - let p2_wall = p2_start.elapsed().as_millis(); - - let mut new_transitives: Vec = Vec::new(); - let mut settle_count = 0usize; - let mut fail_count = 0usize; - for outcome in parsed { - done_names.insert(outcome.name.clone()); - let Some(full_arc) = outcome.full else { - fail_count += 1; - continue; - }; - cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); - for (spec, resolved, core) in outcome.settled { - cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); - cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); - settle_count += 1; - } - new_transitives.extend(outcome.transitives); - } - // Names that fetched but failed parse — still mark done so we - // don't refetch them next iteration. - for name in to_fetch { - done_names.insert(name); + let Some(out) = futs.next().await else { break }; + + if out.transitives.is_empty() && out.fetched { + // Empty result from a fetch is ambiguous (no transitives + // OR a fetch/parse failure). Track conservatively as + // success — the FETCH_TIMINGS-equivalent counter is + // omitted in this path on purpose to keep the future + // body lean. + stats.success += 1; + } else if out.fetched { + stats.success += 1; } - stats.success += settle_count; - stats.fail += fail_count; - - let new_unique: Vec = new_transitives - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) - .collect(); - - tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", - iter, - p2_wall, - settle_count, - fail_count, - new_unique.len(), - ); + // Drain sibling specs deferred while the fetch was in flight. 
+ if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } - pending_specs.extend(new_unique); + // Extend pending with new transitive specs, dedup. + for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } } let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + "p1-breakdown mb_fetch wall={}ms ok={} fail={}", total_wall, - stats.iterations, stats.success, stats.fail, ); diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 9687fc875..06079b248 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -38,7 +38,6 @@ use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, gather_preload_deps, }; -use crate::resolver::fast_preload::fast_preload; use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; @@ -275,19 +274,19 @@ where // Lockfile-only callers (`utoo deps`) skip the receiver-driven // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat - // `FuturesUnordered` over `fetch_full_manifest` that warms the - // `MemoryCache` so the BFS phase below is pure cache-hit. This is - // the manifest-bench-style path; the heavier `preload_manifests` - // path (with `OnceMap` gates + `EventReceiver` events) only runs - // for install paths that need the pipeline signal. + // `BuildEvent::PackageResolved`. 
Route through `mb_fetch` — a + // ruborist-internal standalone preload that bypasses + // `service::http`, `service::manifest`, and `service::registry` + // to match `manifest-bench`'s loop shape directly. PM is + // unaware: this dispatch happens entirely inside ruborist when + // `skip_preload=true` and there's no warm project cache. if skip_preload_caller && cache_count == 0 { let initial_deps = gather_preload_deps(&graph, peer_deps); let preload_config = PreloadConfig { peer_deps, concurrency, }; - fast_preload( + mb_fetch( initial_deps, registry.registry_url(), registry.cache(), @@ -333,166 +332,6 @@ where }) } -/// Experimental parallel-track entry point: structurally identical to -/// [`build_deps`] but routes the manifest-fetch phase through -/// [`crate::resolver::mb_resolve::mb_fetch`] instead of -/// [`crate::resolver::fast_preload::fast_preload`]. -/// -/// Intended for A/B benchmarking: install + lockfile-only callers can -/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). -/// All other behavior — workspace discovery, runtime injection, BFS, -/// graph→lock serialization, project cache export — is the same as -/// `build_deps`. The `EventReceiver` still receives BFS events; it -/// does NOT receive `PreloadFetching` / `PreloadProgress` events -/// because mb_fetch is silent (matches `manifest-bench`'s zero-event -/// loop). -/// -/// **Install-path note:** `pipeline_deps_options` callers that need -/// `PackageResolved` events to drive the download/clone pipeline -/// won't pipeline under this path — mb_fetch finishes all fetches -/// before BFS starts. Use only for `utoo deps`-style workloads, or -/// accept that install becomes phase-sequential. 
-pub async fn build_deps_mb(options: BuildDepsOptions) -> Result -where - G: Glob + Clone, - R: EventReceiver, -{ - let BuildDepsOptions { - cwd, - registry_url, - cache_dir, - manifest_store, - warm_project_cache, - concurrency, - peer_deps, - glob, - receiver, - supports_semver, - catalogs, - skip_preload: _, - } = options; - - // Steps 1-6: structurally identical to `build_deps` — read - // package.json, inject runtime deps, build initial graph, add - // root edges, discover and add workspaces. - let discovery = WorkspaceDiscovery::new(glob.clone()); - let root_path = discovery.find_root_path(&cwd).await?; - let pkg_path = root_path.join("package.json"); - let mut pkg: PackageJson = super::fs::read_json(&pkg_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; - - if let Some(engines) = &pkg.engines { - let runtime_deps = install_runtime_from_map(engines); - if !runtime_deps.is_empty() { - for (name, version) in runtime_deps { - pkg.optional_dependencies - .get_or_insert_with(HashMap::new) - .entry(name) - .or_insert(version); - } - } - } - - let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); - let root_index = graph.root_index; - let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); - add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); - - let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; - for workspace in workspaces { - let ws_pkg = workspace.package_json; - let workspace_node = - PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let workspace_index = graph.add_node(workspace_node); - let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let link_index = graph.add_node(link_node); - graph.add_physical_edge(root_index, workspace_index); - graph.add_physical_edge(root_index, link_index); - let dep_edge_id = graph.add_dependency_edge( - root_index, - 
workspace.name.clone(), - &ws_pkg.version, - EdgeType::Prod, - ); - graph.mark_dependency_resolved(dep_edge_id, workspace_index); - add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); - } - - // Step 7-8: cache + registry, same as `build_deps`. Warm project - // cache is honored. - let package_cache = Arc::new(PackageCache::default()); - let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); - - let mut builder = UnifiedRegistry::builder() - .registry(®istry_url) - .cache(package_cache) - .store(Arc::clone(&manifest_store)); - if let Some(semver) = supports_semver { - builder = builder.supports_semver(semver); - } - let registry = builder.build(); - - // Run mb_fetch instead of fast_preload — pre-warms cache by - // walking transitive deps via flat FuturesUnordered. Skipped if - // the warm project cache already covers the workload. - if cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); - let preload_config = PreloadConfig { - peer_deps, - concurrency, - }; - mb_fetch( - initial_deps, - registry.registry_url(), - registry.cache(), - &preload_config, - ) - .await; - } - - // BFS phase reads the now-warm cache. `skip_preload=true` skips - // the receiver-driven preload — mb_fetch already ran. 
- let mut config = BuildDepsConfig::default() - .with_peer_deps(peer_deps) - .with_concurrency(concurrency) - .with_skip_preload(true) - .with_catalogs(catalogs); - if let Some(dir) = cache_dir { - config = config.with_cache_dir(dir); - } - - build_deps_with_config(&mut graph, ®istry, config, &receiver) - .await - .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; - - let t_serialize_start = std::time::Instant::now(); - let (packages, _total) = graph.serialize_to_packages(&root_path); - let serialize_us = t_serialize_start.elapsed().as_micros() as u64; - - let t_cache_export_start = std::time::Instant::now(); - let mut project_cache = ProjectCacheData::default(); - for (key, manifest) in registry.cache().export_version_manifests() { - let (name, spec) = parse_package_spec(&key); - let version = manifest.version.clone(); - let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); - pkg_cache.specs.insert(spec.to_string(), version.clone()); - pkg_cache.manifests.insert(version, (*manifest).clone()); - } - let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; - - tracing::info!( - "p1-breakdown serialize_us={} cache_export_us={}", - serialize_us, - cache_export_us, - ); - - Ok(BuildDepsOutput { - lock: PackageLock::new(&pkg.name, &pkg.version, packages), - project_cache, - }) -} - /// Pre-populate `cache` from a warm project cache. 
Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 7a7cf8ca8..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 05486b5028961768791ffa3c97d518b33b738d50 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:59:05 +0800 Subject: [PATCH 24/24] experiment(pm): swap DiskManifestStore for NoopStore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Side experiment branched off perf/p1-resolve-concurrency at 01d15130. Question under test: how much of utoo's p1/p3 wall comes from the per-fetch disk-cache existence-check IO that service::registry::UnifiedRegistry issues alongside each manifest fetch (store.load_versions / store.load_version_manifest + fire-and-forget store.store_*)? Swaps `Context::manifest_store` from `DiskManifestStore` to `NoopStore`, which makes every store call a no-op without touching the filesystem. Affects ALL paths that go through `Context`: * `utoo deps` (lockfile-only): already bypasses UnifiedRegistry via mb_resolve, so no perf impact expected — confirms baseline. * `utoo install` (pipeline path): preload_manifests still goes through UnifiedRegistry, so this swap removes per-fetch disk IO from the install resolve phase. p3_cold_install delta is the meaningful number. * BFS edges that miss MemoryCache and fall into resolve_via_full_manifest: no disk fallback, so a cold cache miss falls straight to network instead of checking disk first. 
NOT for landing — measurement-only branch. Compare against 01d15130 to read the disk-cache IO cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +++++++++++++++++++++- crates/pm/src/util/manifest_store.rs | 6 ++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index c8b758a6f..e9226243b 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -9,6 +9,9 @@ use utoo_ruborist::service::{ use crate::service::pipeline::{PipelineChannels, PipelineReceiver}; use crate::util::cache::get_cache_dir; use crate::util::logger::ProgressReceiver; +// EXPERIMENT: DiskManifestStore swapped for NoopStore (see manifest_store +// fn below), so the disk-backed store is unused on this branch. +#[allow(unused_imports)] use crate::util::manifest_store::DiskManifestStore; use crate::util::project_cache; use crate::util::user_config::{ @@ -40,8 +43,25 @@ pub(crate) type Registry = UnifiedRegistry; pub(crate) struct Context; impl Context { + /// EXPERIMENT (experiment/no-disk-cache branch): swap + /// `DiskManifestStore` for `NoopStore` so every + /// `store.load_versions` / `store.load_version_manifest` call in + /// `service::registry::UnifiedRegistry` returns `None` without + /// touching the filesystem, and every `store.store_*` call is a + /// no-op. Used to A/B test how much of utoo's p1/p3 wall comes + /// from the per-fetch disk-cache existence-check IO that the + /// registry layer issues alongside each manifest fetch. + /// + /// Affects ALL paths that build `BuildDepsOptions` via this + /// helper (`deps_options` → `pipeline_deps_options`, + /// `build_deps`). 
The new `mb_resolve` lockfile-only path + /// already bypasses `UnifiedRegistry` entirely, so it sees no + /// effect from this swap; the install path (which still goes + /// through `UnifiedRegistry` for the pipeline preload) does see + /// the difference, and so does any BFS edge that misses + /// `MemoryCache` and falls into `resolve_via_full_manifest`. fn manifest_store() -> Arc { - Arc::new(DiskManifestStore::new(get_cache_dir())) + Arc::new(utoo_ruborist::service::NoopStore) } /// Create BuildDepsOptions with a custom event receiver. diff --git a/crates/pm/src/util/manifest_store.rs b/crates/pm/src/util/manifest_store.rs index 7f9c61bb1..b1fee9818 100644 --- a/crates/pm/src/util/manifest_store.rs +++ b/crates/pm/src/util/manifest_store.rs @@ -19,10 +19,15 @@ use utoo_ruborist::service::{ManifestStore, VersionsInfo}; use crate::util::json::read_json_file; +// EXPERIMENT: ruborist_context swaps DiskManifestStore for NoopStore on +// this branch — type stays defined to keep the import path valid, but +// fields go unread. +#[allow(dead_code)] pub struct DiskManifestStore { cache_dir: PathBuf, } +#[allow(dead_code)] impl DiskManifestStore { pub fn new(cache_dir: PathBuf) -> Self { Self { cache_dir } @@ -75,6 +80,7 @@ impl ManifestStore for DiskManifestStore { /// Serialize `value` and write to `path`. On `NotFound`, create the parent /// directory and retry once — saves the mkdir syscall on every warm-cache /// rewrite. Errors are logged at debug; disk cache is opportunistic. +#[allow(dead_code)] async fn write_json(path: &Path, value: &T) { let bytes = match serde_json::to_vec(value) { Ok(b) => b,