From 2bf71158cde397de4b423ce100385a0e5561e900 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 21:56:24 +0800 Subject: [PATCH 01/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=20256=20+=20add=20fetch=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit p1_resolve has been ~0.9s behind bun on phases bench for the past several PRs. Pcap on prior runs measured bun opening ~260 parallel TCP streams against registry.npmjs.org for resolve, while utoo opened ~70 (the 64 manifests-concurrency-limit cap was at saturation). Adding fetch-breakdown timing in ruborist showed where p1's 22s (local Mac) actually goes: fetch-timings: n=2730 sum_request = 1089s (88% — TCP+TLS+HTTP RTT to first byte) sum_body = 138s (11% — body download) sum_parse = 2s (0.16% — simd_json on rayon) The dominant cost is per-request RTT, not parsing or body transfer. The lever is the cap on concurrent in-flight requests. This commit: 1. Adds `crates/ruborist/src/util/timing.rs` — process-wide atomic accumulator that records per-fetch (request_us, body_us, parse_us, bytes) inside both `fetch_full_manifest` and `fetch_version_manifest`. Reset before each preload phase, dumped at INFO level after preload + bfs. 2. Bumps `manifests-concurrency-limit` default 64 → 256 to match bun's observed working point against npmjs.org. CI bench will validate. Expected: p1 utoo wall drops toward bun's range (~2.3s on GHA). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 10 +- crates/ruborist/src/resolver/builder.rs | 17 ++- crates/ruborist/src/service/manifest.rs | 24 ++++- crates/ruborist/src/util/mod.rs | 2 + crates/ruborist/src/util/timing.rs | 134 ++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 6 deletions(-) create mode 100644 crates/ruborist/src/util/timing.rs diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 34ee45a34..bc281fb40 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,9 +132,15 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration +// Manifest fetch concurrency configuration. +// +// 256 to match bun's observed ~260 parallel TCP streams against +// registry.npmjs.org. Local fetch-breakdown instrumentation showed +// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), +// only 11% body, 0.16% parse — so the dominant lever for p1 wall is +// the cap on concurrent in-flight manifest requests. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index b0bf2794c..166372c91 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -756,6 +756,7 @@ async fn run_preload_phase( return; } + crate::util::FETCH_TIMINGS.reset(); let start = tokio::time::Instant::now(); let initial_deps = gather_preload_deps(graph, config.peer_deps); @@ -794,7 +795,13 @@ async fn run_preload_phase( failed: stats.failed_count, }); - tracing::debug!("Preload phase: {:?}", start.elapsed()); + let preload_elapsed = start.elapsed(); + tracing::debug!("Preload phase: {:?}", preload_elapsed); + tracing::info!( + "p1-breakdown preload_wall={}ms | {}", + preload_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); } /// Run the BFS traversal phase to build the dependency tree. 
@@ -896,7 +903,13 @@ async fn run_bfs_phase( current_level = next_level; } - tracing::debug!("Build phase: {:?}", start.elapsed()); + let bfs_elapsed = start.elapsed(); + tracing::debug!("Build phase: {:?}", bfs_elapsed); + tracing::info!( + "p1-breakdown bfs_wall={}ms | {}", + bfs_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); Ok(()) } diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 74baf3b9c..36bc6a85a 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -12,6 +12,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on rayon's CPU thread pool (native) or inline /// (wasm32). Keeps the tokio runtime free of `simd_json` work so other @@ -91,7 +92,9 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result) -> Result(bytes) + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = bytes.len() as u64; + let t_parse_start = std::time::Instant::now(); + let parsed = parse_json_off_runtime::(bytes) .await - .map_err(FetchError::Permanent) + .map_err(FetchError::Permanent); + let parse_us = t_parse_start.elapsed().as_micros() as u64; + if parsed.is_ok() { + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + } + parsed } else { Err(classify_status(response.status(), &url)) } diff --git a/crates/ruborist/src/util/mod.rs b/crates/ruborist/src/util/mod.rs index 649e47c95..a7f0b7b7d 100644 --- a/crates/ruborist/src/util/mod.rs +++ b/crates/ruborist/src/util/mod.rs @@ -1,6 +1,8 @@ //! Shared utility primitives for ruborist and downstream consumers. 
pub mod oncemap; +pub mod timing; pub use crate::model::util::{PackageNameStr, parse_package_spec, read_package_json}; pub use oncemap::OnceMap; +pub use timing::{FETCH_TIMINGS, FetchTimings, FetchTimingsSnapshot}; diff --git a/crates/ruborist/src/util/timing.rs b/crates/ruborist/src/util/timing.rs new file mode 100644 index 000000000..f50e921b9 --- /dev/null +++ b/crates/ruborist/src/util/timing.rs @@ -0,0 +1,134 @@ +//! Per-phase manifest fetch timing accumulator for p1 perf investigation. +//! +//! Splits each `fetch_*_manifest` call into three observable pieces: +//! - `request_us`: from `request.send().await` to response headers +//! received. Captures TCP connect (when not pooled), TLS handshake, +//! HTTP request roundtrip, and server-side processing. +//! - `body_us`: from response headers to the entire JSON body buffered. +//! Network-bandwidth bound for large packuments. +//! - `parse_us`: from full body buffered to a typed manifest. CPU bound +//! (simd_json on a spawn_blocking thread). +//! +//! `parse_us` is wall-clock for the await on `parse_json_off_runtime` — +//! since JSON parse runs on `spawn_blocking`, this includes scheduling +//! latency rather than pure CPU time. Together with the per-fetch total +//! already tracked in `preload_manifests`, this lets us answer "where +//! did p1's wall time go?" without external profiling. +//! +//! All counters are `AtomicU64` so the recording path is lock-free. +//! Numbers are reset between resolves via [`reset()`] so successive +//! `utoo deps` invocations report independently. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Per-process accumulator for manifest fetch timings. +#[derive(Default, Debug)] +pub struct FetchTimings { + /// Number of fetches recorded (full + version manifest). + pub count: AtomicU64, + /// Sum of microseconds spent in `request.send().await`. + pub request_us: AtomicU64, + /// Sum of microseconds spent in `response.bytes().await`. 
+ pub body_us: AtomicU64, + /// Sum of microseconds spent awaiting `parse_json_off_runtime`. + pub parse_us: AtomicU64, + /// Sum of body bytes received across all fetches. + pub bytes: AtomicU64, +} + +impl FetchTimings { + /// Record one fetch's split timings. Call once per successful fetch. + pub fn record(&self, request_us: u64, body_us: u64, parse_us: u64, bytes: u64) { + self.count.fetch_add(1, Ordering::Relaxed); + self.request_us.fetch_add(request_us, Ordering::Relaxed); + self.body_us.fetch_add(body_us, Ordering::Relaxed); + self.parse_us.fetch_add(parse_us, Ordering::Relaxed); + self.bytes.fetch_add(bytes, Ordering::Relaxed); + } + + /// Reset all counters to zero. + pub fn reset(&self) { + self.count.store(0, Ordering::Relaxed); + self.request_us.store(0, Ordering::Relaxed); + self.body_us.store(0, Ordering::Relaxed); + self.parse_us.store(0, Ordering::Relaxed); + self.bytes.store(0, Ordering::Relaxed); + } + + /// Snapshot of the current accumulator state. + pub fn snapshot(&self) -> FetchTimingsSnapshot { + FetchTimingsSnapshot { + count: self.count.load(Ordering::Relaxed), + request_us: self.request_us.load(Ordering::Relaxed), + body_us: self.body_us.load(Ordering::Relaxed), + parse_us: self.parse_us.load(Ordering::Relaxed), + bytes: self.bytes.load(Ordering::Relaxed), + } + } +} + +/// Immutable snapshot suitable for printing. +#[derive(Debug, Clone, Copy)] +pub struct FetchTimingsSnapshot { + pub count: u64, + pub request_us: u64, + pub body_us: u64, + pub parse_us: u64, + pub bytes: u64, +} + +impl FetchTimingsSnapshot { + /// One-line summary for tracing logs. 
+ pub fn summary_line(&self) -> String { + if self.count == 0 { + return "fetch-timings: no requests recorded".to_string(); + } + let count = self.count; + let avg_req = self.request_us / count; + let avg_body = self.body_us / count; + let avg_parse = self.parse_us / count; + let avg_bytes = self.bytes / count; + format!( + "fetch-timings: n={} sum_request={}ms sum_body={}ms sum_parse={}ms total_bytes={}MB | avg_request={}us avg_body={}us avg_parse={}us avg_bytes={}KB", + count, + self.request_us / 1_000, + self.body_us / 1_000, + self.parse_us / 1_000, + self.bytes / 1_000_000, + avg_req, + avg_body, + avg_parse, + avg_bytes / 1_024, + ) + } +} + +/// Process-wide manifest fetch timing accumulator. +pub static FETCH_TIMINGS: FetchTimings = FetchTimings { + count: AtomicU64::new(0), + request_us: AtomicU64::new(0), + body_us: AtomicU64::new(0), + parse_us: AtomicU64::new(0), + bytes: AtomicU64::new(0), +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn record_and_snapshot() { + FETCH_TIMINGS.reset(); + FETCH_TIMINGS.record(100, 200, 300, 1024); + FETCH_TIMINGS.record(150, 250, 350, 2048); + let snap = FETCH_TIMINGS.snapshot(); + assert_eq!(snap.count, 2); + assert_eq!(snap.request_us, 250); + assert_eq!(snap.body_us, 450); + assert_eq!(snap.parse_us, 650); + assert_eq!(snap.bytes, 3072); + FETCH_TIMINGS.reset(); + let snap2 = FETCH_TIMINGS.snapshot(); + assert_eq!(snap2.count, 0); + } +} From 8ac97ae036ab97cb986ce19109af18e130dbc1cd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:25:36 +0800 Subject: [PATCH 02/32] =?UTF-8?q?chore(p1):=20revert=20concurrency=20256?= =?UTF-8?q?=20=E2=86=92=2064=20+=20restore=20manifest-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes after the GHA bench on the previous commit (PR #2916, run 25559625024) showed the concurrency=256 hypothesis was wrong on GHA's environment. 
Revert concurrency 256 → 64 --------------------------- The new fetch-timing instrumentation shipped in the previous commit caught the surprise: GHA's pcap-vs-local profile is the *opposite* of what local Mac measurements suggested. metric local Mac GHA Linux avg_request 399ms 70ms ← network MUCH faster on GHA avg_body 50ms 20ms avg_parse 730µs 266ms ← parse 365× SLOWER on GHA Mechanism: `parse_json_off_runtime` dispatches to `rayon::spawn`, and rayon's pool size is `num_cpus` (= 2 on GHA ubuntu-latest). Bumping concurrency 64 → 256 queued 256 manifest parses behind 2 rayon workers — head-of-line blocking. avg_parse jumped from ~10ms to 266ms wall, dragging p1 utoo wall from 3.10s up to 3.33s. Restore manifest-bench ---------------------- Brought back `crates/manifest-bench` (originally landed in the post-#2818 driver hunt, dropped in af714eb3 once #2818 graduated). It's a single-binary HTTP-only fetch tool that strips out the ruborist pipeline (no BFS, no dedup, no parse, no project cache, no lockfile write) — fires `GET /` in parallel and reports the same diag shape as the new `p1-breakdown` lines. Goal: separate the network ceiling from the resolver pipeline so the next round of p1 experiments (parse offload, partial parse, dedicated parse pool, etc.) can be evaluated against a stable "pure network" baseline. Knobs (unchanged from the original drop): --concurrency N sweep without rebuilding utoo --reps N run same workload back-to-back --single-version use //latest (smaller bodies) --user-agent X UA-fingerprint experiments --http1-only H2 vs H1 toggle --accept X override Accept header Same TLS stack as ruborist (rustls + aws-lc-rs, native roots). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 1 + crates/manifest-bench/Cargo.toml | 37 +++ crates/manifest-bench/src/main.rs | 371 ++++++++++++++++++++++++++++++ crates/pm/src/util/user_config.rs | 19 +- 4 files changed, 421 insertions(+), 7 deletions(-) create mode 100644 crates/manifest-bench/Cargo.toml create mode 100644 crates/manifest-bench/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index ef4a4f926..0574a185a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] resolver = "2" members = [ + "crates/manifest-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/crates/manifest-bench/Cargo.toml b/crates/manifest-bench/Cargo.toml new file mode 100644 index 000000000..5b01e57c0 --- /dev/null +++ b/crates/manifest-bench/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "manifest-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Standalone HTTP-only manifest fetch benchmark, isolating network behaviour from ruborist's resolver pipeline." + +[[bin]] +name = "manifest-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Identical TLS / DNS choices to ruborist so we measure the *protocol* +# characteristics of the same stack, not a different implementation. 
+reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/manifest-bench/src/main.rs b/crates/manifest-bench/src/main.rs new file mode 100644 index 000000000..fa70f3fe4 --- /dev/null +++ b/crates/manifest-bench/src/main.rs @@ -0,0 +1,371 @@ +//! Standalone HTTP-only manifest fetch benchmark. +//! +//! Isolates the network behaviour of `reqwest + rustls + tokio` from +//! ruborist's resolver pipeline (BFS, dedup, parse, lockfile, project +//! cache). Reads a list of package names, builds manifest URLs, fires +//! parallel `GET` requests, records `(start, end)` per request, and +//! reports the same diag shape as ruborist's `Preload HTTP diag` line. +//! +//! Two input modes: +//! - `--names-file ` — newline-separated package names +//! - `--lockfile ` — a npm-style package-lock.json; we extract +//! the `packages.*` (v3) or `dependencies.*` (v2) keys +//! +//! Two registry modes: +//! - `/` — full manifest endpoint (default, npmjs) +//! - `//latest` — single-version endpoint +//! (gated behind `--single-version`) +//! +//! Each request reads the body to completion (we only measure I/O, no +//! parse). Output: same fields as preload's HTTP diag for direct +//! comparison. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; + +#[derive(Parser, Debug)] +#[command( + name = "manifest-bench", + about = "HTTP-only manifest fetch bench (no parse, no resolver)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// File of newline-separated package names. Mutually exclusive with `--lockfile`. 
+ #[arg(long, conflicts_with = "lockfile")] + names_file: Option, + + /// `package-lock.json` file. Reads top-level `packages.*.name` keys. + #[arg(long)] + lockfile: Option, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 128)] + concurrency: usize, + + /// Number of times to repeat the whole sweep (each iteration is a + /// fresh `reqwest::Client`, so connection pool / TLS handshake + /// costs are paid each time, matching `hyperfine` cold-start). + #[arg(long, default_value_t = 1)] + reps: usize, + + /// Use the single-version endpoint `//latest` instead of the + /// full-manifest endpoint `/`. Smaller bodies, more requests + /// served per byte. + #[arg(long)] + single_version: bool, + + /// Override `Accept` header. Default mimics ruborist's preload + /// (`application/vnd.npm.install-v1+json` — abbreviated metadata). + #[arg(long)] + accept: Option, + + /// Override `User-Agent`. Default uses reqwest's default. Try + /// `Bun/1.x.x` to test whether Cloudflare differentiates by UA. + #[arg(long)] + user_agent: Option, + + /// Force HTTP/1.1 (no H2 negotiation). Default lets ALPN decide. 
+ #[arg(long)] + http1_only: bool, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let names = load_names(&args)?; + if names.is_empty() { + return Err(anyhow!("no package names found in input")); + } + + println!( + "manifest-bench: registry={} concurrency={} reps={} names={} h1_only={} single_version={} accept={} ua={}", + args.registry, + args.concurrency, + args.reps, + names.len(), + args.http1_only, + args.single_version, + args.accept.as_deref().unwrap_or(""), + args.user_agent.as_deref().unwrap_or(""), + ); + + for rep in 1..=args.reps { + run_once(&args, &names, rep).await?; + } + + Ok(()) +} + +fn load_names(args: &Args) -> Result> { + if let Some(path) = &args.names_file { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return Ok(raw + .lines() + .map(str::trim) + .filter(|s| !s.is_empty() && !s.starts_with('#')) + .map(str::to_string) + .collect()); + } + + if let Some(path) = &args.lockfile { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return extract_lockfile_names(&raw); + } + + Err(anyhow!("provide --names-file or --lockfile")) +} + +/// Pull unique package names from an npm v3 lockfile (`packages.*`) +/// or an older v2 lockfile (`dependencies.*`). +fn extract_lockfile_names(raw: &str) -> Result> { + use std::collections::BTreeSet; + + let v: serde_json::Value = serde_json::from_str(raw).context("parse lockfile JSON")?; + let mut names: BTreeSet = BTreeSet::new(); + + if let Some(packages) = v.get("packages").and_then(|p| p.as_object()) { + for key in packages.keys() { + if key.is_empty() { + continue; + } + // npm v3 packages key like "node_modules/foo" or + // "node_modules/@scope/bar/node_modules/baz" — take the + // last path segment (or @scope/name pair). 
+ let last = last_module_name(key); + if !last.is_empty() { + names.insert(last); + } + } + } else if let Some(deps) = v.get("dependencies").and_then(|d| d.as_object()) { + for key in deps.keys() { + names.insert(key.clone()); + } + } + + Ok(names.into_iter().collect()) +} + +fn last_module_name(key: &str) -> String { + let parts: Vec<&str> = key.split("node_modules/").collect(); + let tail = parts.last().copied().unwrap_or(""); + tail.to_string() +} + +#[derive(Debug)] +struct ReqResult { + start: Instant, + end: Instant, + bytes: usize, + status: u16, +} + +async fn run_once(args: &Args, names: &[String], rep: usize) -> Result<()> { + // Build a fresh client per rep — matches hyperfine's cold-start + // assumption that each iteration pays the TLS handshake cost. + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let accept = Arc::new( + args.accept + .clone() + .unwrap_or_else(|| "application/vnd.npm.install-v1+json".to_string()), + ); + + let single_version = args.single_version; + let concurrency = args.concurrency; + + let phase_start = Instant::now(); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; + let mut results: Vec = Vec::with_capacity(names.len()); + + while idx < names.len() && futs.len() < concurrency { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + + while let Some(res) = futs.next().await { + results.push(res); + if idx < names.len() { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + } + let phase_wall_ms = phase_start.elapsed().as_millis(); + + report(rep, &results, phase_wall_ms); + Ok(()) +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_one( + client: &reqwest::Client, + registry: &Arc, + name: &str, + accept: &Arc, + single_version: bool, + futs: &mut FuturesUnordered, +) { + let url = if single_version { + 
format!("{registry}/{name}/latest") + } else { + format!("{registry}/{name}") + }; + let client = client.clone(); + let accept = Arc::clone(accept); + futs.push(Box::pin(async move { + let start = Instant::now(); + let req = client.get(&url).header("accept", accept.as_str()).send(); + let (bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.len()).unwrap_or(0); + (body, status) + } + Err(_) => (0, 0), + }; + let end = Instant::now(); + ReqResult { + start, + end, + bytes, + status, + } + })); +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Matches ruborist's `service::http` setup. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? 
+ .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} + +fn report(rep: usize, results: &[ReqResult], wall_ms: u128) { + if results.is_empty() { + eprintln!("[rep {rep}] no results"); + return; + } + + let mut spans: Vec<(Instant, Instant)> = results.iter().map(|r| (r.start, r.end)).collect(); + spans.sort_by_key(|(s, _)| *s); + + let first_start = spans.first().unwrap().0; + let last_end = spans.iter().map(|(_, e)| *e).max().unwrap(); + let win_wall = last_end.duration_since(first_start).as_millis(); + + let mut per_us: Vec = spans + .iter() + .map(|(s, e)| e.duration_since(*s).as_micros()) + .collect(); + per_us.sort_unstable(); + let n = per_us.len(); + let pct = |p: usize| per_us[(n * p).div_ceil(100).saturating_sub(1)]; + let sum: u128 = per_us.iter().sum(); + let p50 = per_us[n / 2]; + + let mut busy_us: u128 = 0; + let (mut cur_s, mut cur_e) = spans[0]; + for &(s, e) in &spans[1..] 
{ + if s <= cur_e { + if e > cur_e { + cur_e = e; + } + } else { + busy_us += cur_e.duration_since(cur_s).as_micros(); + cur_s = s; + cur_e = e; + } + } + busy_us += cur_e.duration_since(cur_s).as_micros(); + + let bytes_total: usize = results.iter().map(|r| r.bytes).sum(); + let ok = results.iter().filter(|r| r.status == 200).count(); + let err = results.iter().filter(|r| r.status == 0).count(); + let four_xx = results + .iter() + .filter(|r| (400..500).contains(&r.status)) + .count(); + let five_xx = results + .iter() + .filter(|r| (500..600).contains(&r.status)) + .count(); + + let avg_conc = if busy_us > 0 { + sum as f64 / busy_us as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] n={} phase_wall={}ms win_wall={}ms busy={}ms ({:.0}%) sum={}ms avg_conc={:.1} p50={}ms p95={}ms p99={}ms max={}ms bytes={} 200={} 4xx={} 5xx={} err={}", + n, + wall_ms, + win_wall, + busy_us / 1000, + if win_wall > 0 { + 100.0 * (busy_us as f64 / 1000.0) / win_wall as f64 + } else { + 0.0 + }, + sum / 1000, + avg_conc, + p50 / 1000, + pct(95) / 1000, + pct(99) / 1000, + per_us.last().unwrap() / 1000, + bytes_total, + ok, + four_xx, + five_xx, + err, + ); +} diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index bc281fb40..a0235830a 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,15 +132,20 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration. +// Manifest fetch concurrency configuration. Default kept at 64. // -// 256 to match bun's observed ~260 parallel TCP streams against -// registry.npmjs.org. Local fetch-breakdown instrumentation showed -// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), -// only 11% body, 0.16% parse — so the dominant lever for p1 wall is -// the cap on concurrent in-flight manifest requests. 
+// We tried 256 to match bun's observed parallel streams; on GHA the +// fetch-breakdown instrumentation showed sum_parse exploded from +// ~10ms (local Mac, network-bound) to 728s on first cold run with +// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to +// rayon, which has only num_cpus (=2 on GHA) workers. Bumping +// concurrency to 256 queued 256 parses behind 2 workers → wall +// per-parse jumped from 730µs to 266ms. Net p1 wall *increased* +// 3.10s → 3.33s on phases bench. Keep 64 until we address the +// parse-side queueing (e.g. inline parse on tokio, or a wider +// dedicated parse pool). static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 5690a9b6b416fb7040a52a3ce24a303177d8bc76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:56:20 +0800 Subject: [PATCH 03/32] ci(p1): wire manifest-bench standalone HTTP sweep into bench-phases-linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build-linux now also builds + uploads `manifest-bench` when a phases bench is going to run (label or dispatch). bench-phases-linux downloads the binary and runs it after the regular phase-isolated benchmark. Sweep mirrors the original (#2818-era) wire-in: concurrency: 32 / 64 / 96 / 128 / 192 / 256 (HTTP/1.1, full manifest) protocol: H1 vs H2-negotiate (cap=128) endpoint: full vs `//latest` (cap=128, smaller bodies) UA: default vs `Bun/1.2.21` (cap=128) Output goes to /tmp/pm-bench-output/manifest-bench-npmjs.log and ships in the existing pm-bench-logs-linux artifact — no PR comment surface (the headline phases bench comment stays the same). 
Why now: the new ruborist `p1-breakdown` instrumentation showed sum_parse on GHA can dominate when concurrency is bumped (256: sum_parse 728s vs sum_request 193s). To attribute the bun-vs-utoo gap on p1_resolve we need a "pure HTTP" baseline that strips out ruborist's parse / BFS / dedup / lockfile path. manifest-bench is that baseline: same TLS stack as ruborist (rustls + aws-lc-rs, native roots), no resolver pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 74c90ece5..b25f5c380 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -143,6 +143,24 @@ jobs: name: utoo-linux-x64 path: target/x86_64-unknown-linux-gnu/release/utoo retention-days: 1 + # manifest-bench is a standalone HTTP-only fetch sweeper used as + # the network-only baseline for p1_resolve perf work. Built only + # when phases bench is going to run (label or dispatch), so plain + # PR builds aren't slowed by the extra crate. 
+ - name: Build manifest-bench (p1 baseline) + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p manifest-bench + - name: Upload manifest-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/manifest-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -516,6 +534,19 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # Download the manifest-bench binary built by build-linux. Used as + # the network-only baseline for p1_resolve work — strips out parse, + # BFS, dedup, lockfile write so the wall is pure HTTP fetch. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -565,6 +596,55 @@ jobs: run: | mkdir -p /tmp/pm-bench-output bash bench/pm-bench-phases.sh 2>&1 | tee /tmp/pm-bench-output/bench-phases-npmmirror.log + # Standalone HTTP-only sweep — sweeps the network-only ceiling + # against the same lockfile-derived workload phase-bench just used. + # Output goes into the bench logs artifact; no PR comment surface. + - name: Standalone manifest-bench (HTTP-only sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + mkdir -p /tmp/pm-bench + git clone --depth 1 "https://github.com/ant-design/$PROJECT" "$PROJECT_DIR" + fi + cd "$PROJECT_DIR" + if [ ! -f package-lock.json ]; then + echo "==> generating lockfile via utoo (one-shot, untimed)" + utoo deps --registry "$REGISTRY" || true + fi + ls -la package-lock.json || { echo "no lockfile; skipping manifest-bench"; exit 0; } + + MB_LOG=/tmp/pm-bench-output/manifest-bench-npmjs.log + { + echo "============================================================" + echo "manifest-bench: HTTP-only fetch (no parse, no resolver)" + echo " Goal: isolate reqwest/rustls/tokio behaviour from" + echo " ruborist's resolver pipeline. Same metric shape as" + echo " ruborist's p1-breakdown line." 
+ echo "============================================================" + for CAP in 32 64 96 128 192 256; do + echo + echo "--- concurrency=$CAP, h1, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 2 --http1-only || true + done + echo + echo "--- concurrency=128, h2 negotiate, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 || true + echo + echo "--- concurrency=128, h1, single-version endpoint ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --single-version || true + echo + echo "--- concurrency=128, h1, UA=Bun/1.2.21 ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true + } 2>&1 | tee "$MB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 From 94af458887de3add09f2e973dbbad6f2524f1a5f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:24:56 +0800 Subject: [PATCH 04/32] perf(ruborist): inline JSON parse, drop rayon::spawn dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI fetch-breakdown on GHA (run 25562552058, conc=64) showed parse queueing on rayon dominates the gap to manifest-bench's pure-HTTP baseline: manifest-bench (pure HTTP, conc=64): 2.12s wall utoo p1 (full ruborist): 3.10s wall ← +1.0s overhead ↑ sum_parse 95s vs sum_request 95s, parse 50% of work-time ↑ avg_parse 30ms wall vs ~5ms actual CPU — the 25ms extra is rayon queue wait Mechanism: 64 concurrent tasks all dispatching parse to rayon's pool (size = num_cpus = 2 on GHA). Queue depth grows to ~32 per worker. Each parse waits 25ms+ in queue before running its 5ms of CPU work. Round 1 fix: inline parse, drop the rayon hop. 
simd_json on a tokio worker thread is fast (~5ms for 115KB JSON), and the tokio runtime's cooperative budget naturally rebalances CPU across the 64 tasks. Expected on next CI: - avg_parse drops from 30ms wall → ~5-10ms wall (close to CPU-only) - preload_wall drops from 5.4s → ~3.5-4s for cold runs - p1 hyperfine wall drops from 3.10s → 2.3-2.5s, narrowing the gap to manifest-bench's 2.12s ceiling If parse becomes the new bottleneck (CPU-bound), next round could look at partial parse / lazy field access. If wall doesn't drop, hypothesis is wrong and we look elsewhere (BFS, dedup, lockfile). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 29 +++++++++---------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 36bc6a85a..3502f6ec2 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,29 +14,20 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes on rayon's CPU thread pool (native) or inline -/// (wasm32). Keeps the tokio runtime free of `simd_json` work so other -/// in-flight manifest fetches keep driving network IO while this one -/// parses. +/// Parse JSON bytes inline on the calling tokio task. Previously this +/// dispatched to `rayon::spawn` to "free the runtime", but +/// fetch-breakdown instrumentation on GHA showed the rayon hop made it +/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), +/// 64 concurrent fetches all dispatching parse queued behind 2 workers +/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + +/// CPU). Inlining puts parse on the tokio worker that already owns +/// the buffer; the cooperative-scheduling budget naturally rebalances +/// CPU between fetches. 
async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - #[cfg(not(target_arch = "wasm32"))] - { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { - let result = simd_json::serde::from_slice::(&mut bytes) - .map_err(|e| anyhow!("JSON parse error: {e}")); - let _ = tx.send(result); - }); - rx.await - .map_err(|e| anyhow!("rayon parse channel closed: {e}"))? - } - #[cfg(target_arch = "wasm32")] - { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) - } + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) } /// Result of a full manifest fetch with ETag support. From ee5f5f4d23c8c9668c90c7d6b3b12eb49dab3afe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:47:49 +0800 Subject: [PATCH 05/32] perf(ruborist): switch JSON parse to tokio spawn_blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 1 (inline parse) reverted on data: GHA showed +0.37s p1 regression because parse blocked tokio runtime workers, dropping eff_parallel 42 → 35 even though per-fetch work-time fell. avg_request went up from 35ms → 52ms — symptomatic of socket reads being delayed by the parsing task on the same worker. metric round 0 (rayon) round 1 (inline) p1 wall 3.27s 3.64s ⚠️ +0.37s avg_parse 30ms (queued) 300µs ✓ avg_request 35ms 52ms ⚠️ +17ms (worker contention) eff_parallel 42 35 ⚠️ Round 2 attempts the third option: `tokio::task::spawn_blocking`. - rayon's pool was too small (num_cpus = 2 on GHA) — 64 concurrent parses queued behind 2 workers, parse wall 30ms. - inline parse held tokio worker hostage during simd_json call, starving in-flight socket reads. - tokio's blocking pool has a much larger default cap (512), so 64 concurrent parses never queue. Unlike rayon there's no contention with the install path's parallel-write rayon usage. 
Unlike inline the tokio runtime workers stay free to drive network I/O. Expected on next CI: - avg_parse drops to ~5-10ms wall (close to CPU floor, no queue) - avg_request stays ~35ms (workers free for I/O) - eff_parallel returns to ~50, possibly higher - p1 wall drops toward manifest-bench's 2.10s ceiling Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 3502f6ec2..90f1db71b 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,20 +14,39 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes inline on the calling tokio task. Previously this -/// dispatched to `rayon::spawn` to "free the runtime", but -/// fetch-breakdown instrumentation on GHA showed the rayon hop made it -/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), -/// 64 concurrent fetches all dispatching parse queued behind 2 workers -/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + -/// CPU). Inlining puts parse on the tokio worker that already owns -/// the buffer; the cooperative-scheduling budget naturally rebalances -/// CPU between fetches. +/// Parse JSON bytes on tokio's blocking thread pool. +/// +/// The history of this function captures three different attempts: +/// - rayon::spawn (original): rayon's pool is `num_cpus` (= 2 on +/// GHA), 64 concurrent parses queued behind 2 workers → avg_parse +/// 30ms wall vs ~5ms CPU. round-0 baseline. 
+/// - inline (round 1, reverted): no rayon hop, but the simd_json +/// call blocks the tokio runtime worker, so other in-flight +/// fetches couldn't drive their socket I/O — avg_request grew +/// 35ms → 52ms (+17ms), eff_parallel 42 → 35, net p1 wall +0.37s. +/// - spawn_blocking (current): tokio's dedicated blocking pool has +/// a much higher default cap (512), so 64 concurrent parses are +/// never queued. Unlike rayon there's no contention with the +/// install path's parallel-write rayon usage, and unlike inline +/// the tokio runtime workers stay free to drive network I/O on +/// all in-flight fetches. async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + simd_json::serde::from_slice::(&mut bytes) + .map_err(|e| anyhow!("JSON parse error: {e}")) + }) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? + } + #[cfg(target_arch = "wasm32")] + { + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + } } /// Result of a full manifest fetch with ETag support. From 16404fc481577a03b00ba2f46aa1f3711ec5351f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:14:46 +0800 Subject: [PATCH 06/32] perf(ruborist): switch extract_core_version to spawn_blocking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 moved parse_json_off_runtime off rayon (-0.11s p1). But fetch-breakdown still showed avg_request 41ms vs round 0's 35ms, hinting at a second source of rayon contention. Found it: `extract_core_version_off_runtime` is also on `rayon::spawn`. 
On npmjs.org's `!supports_semver` path EVERY fetch resolves through `resolve_via_full_manifest`, which fetches the full packument once per package name (deduped via inflight_full) and then calls `extract_core_version_off_runtime` per (name, spec) to materialize the chosen version into a `CoreVersionManifest`. So per fetch we hit rayon TWICE — once for the JSON parse (round 2 moved to spawn_blocking), and once for `get_core_version` (still on rayon). The second hop has the same head-of-line blocking signature as the first: 64 concurrent resolves dispatching to a 2-thread rayon pool. Round 3: move extract_core_version_off_runtime to spawn_blocking for the same reasons. The work is JSON lazy-reparse (`raw_json` sub-tree decoding) — genuinely blocking, well-suited for tokio's blocking pool. Expected: utoo p1 wall drops further toward manifest-bench's 2.10s ceiling. avg_request should fall back from 41ms → ~35ms (rayon contention removed from the fetch task's await chain). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 37e95deb9..15c762eb5 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,14 +163,20 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { + // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking + // history: rayon's `num_cpus` pool oversubscribes when many concurrent + // resolves all extract from full manifests at once. spawn_blocking's + // larger pool avoids the queue, and the work is genuinely blocking + // (lazy JSON re-parse via `get_core_version`) so the blocking pool + // is the right home. 
#[cfg(not(target_arch = "wasm32"))] { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { + tokio::task::spawn_blocking(move || { let core = full.get_core_version(&version).map(Arc::new); - let _ = tx.send((version, core)); - }); - rx.await.expect("rayon parse worker dropped before sending") + (version, core) + }) + .await + .expect("spawn_blocking parse worker panicked") } #[cfg(target_arch = "wasm32")] { From 460a53885b30982bd19c68ca1a866fa540c66a76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:37:37 +0800 Subject: [PATCH 07/32] revert + instrument(ruborist): post-build phase timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes for round 4 of p1 optimization: 1. Revert `extract_core_version_off_runtime` from spawn_blocking back to rayon::spawn (round 3). Within-run measurement showed +0.42s regression vs utoo-next (round 2 was +0.11s). Likely cause: this function is called per (name, spec), so multi-spec packages call it 2-5x per fetch. spawn_blocking's per-dispatch overhead exceeds rayon queue savings at this multiplier. 2. Add `serialize_us` and `cache_export_us` to the p1-breakdown line so we can attribute the remaining gap. Currently: manifest-bench wall: 2.10s (pure HTTP ceiling) utoo p1 wall (round 2): 3.16s gap: 1.06s We have: preload_wall ≈ 2.7s (logged) bfs_wall ≈ 0.3s (logged) serialize_us ? cache_export_us ? ← suspected: full manifest deep-clone into ProjectCacheData for ~2730 entries Next round will have data to choose between attacking serialize, cache export, or the BFS loop body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 24 +++++++++++++----------- crates/ruborist/src/service/api.rs | 10 ++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 15c762eb5..3509e839d 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,20 +163,22 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { - // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking - // history: rayon's `num_cpus` pool oversubscribes when many concurrent - // resolves all extract from full manifests at once. spawn_blocking's - // larger pool avoids the queue, and the work is genuinely blocking - // (lazy JSON re-parse via `get_core_version`) so the blocking pool - // is the right home. + // Round 3 attempted to switch this to `tokio::task::spawn_blocking` + // for the same reasons as `parse_json_off_runtime`, but CI showed + // it regressed p1 by 0.5s on `preload_wall`. Mechanism: this + // function is called per (name, spec), so packages with multiple + // specs (e.g. peer-dep range overlaps) call it 2-5x per fetch. + // spawn_blocking's per-dispatch overhead (channel + thread wake) + // is significant for short CPU work; with the multiplier this + // outweighed rayon queue waits at conc=64. Keep on rayon::spawn. 
#[cfg(not(target_arch = "wasm32"))] { - tokio::task::spawn_blocking(move || { + let (tx, rx) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { let core = full.get_core_version(&version).map(Arc::new); - (version, core) - }) - .await - .expect("spawn_blocking parse worker panicked") + let _ = tx.send((version, core)); + }); + rx.await.expect("rayon parse worker dropped before sending") } #[cfg(target_arch = "wasm32")] { diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 878b357a1..82703ed97 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -258,9 +258,12 @@ where .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + let t_serialize_start = std::time::Instant::now(); let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; // Export project cache from memory cache for the host to persist. + let t_cache_export_start = std::time::Instant::now(); let mut project_cache = ProjectCacheData::default(); for (key, manifest) in registry.cache().export_version_manifests() { // `parse_package_spec` rather than `split_once('@')` so scoped names @@ -271,6 +274,13 @@ where pkg_cache.specs.insert(spec.to_string(), version.clone()); pkg_cache.manifests.insert(version, (*manifest).clone()); } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); Ok(BuildDepsOutput { lock: PackageLock::new(&pkg.name, &pkg.version, packages), From 58d49aafd2f886d1af364d91f85997e4dc01e37e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:02:11 +0800 Subject: [PATCH 08/32] instrument(ruborist): preload main loop dispatch + result split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 4 measured serialize_us = 
15ms and cache_export_us = 34ms — both tiny — confirming the 1s gap
from manifest-bench (utoo p1 = 3.16s vs mb wall = 2.10s) is not in
post-build code.

Per-fetch math also pointed at main-loop bookkeeping:

  manifest-bench: eff_parallel = 52 (sum_work 111s / wall 2.14s)
  utoo preload  : eff_parallel = 43 (sum_work 120s / wall 2.85s)

Same conc=64 cap, but utoo loses 9 effective slots — most likely the
main loop's serial bookkeeping (dedup hash insert, format! key,
extract_transitive_deps, queue push, 3-4 receiver events) holds the
flow between futures.next() returning and the next fetch dispatch.

This commit splits the main loop into two timed segments:

preload_loop_dispatch_us: time spent in the `while in_flight <
concurrency` block — popping pending, dedup check, futures.push.

preload_loop_result_us: time spent processing each completed
future — extract_transitive_deps, pending.extend, on_manifest.

If dispatch+result sum approaches preload_wall, the main loop is the
bottleneck and we need to either (a) split processing onto a
dedicated task, or (b) use unbounded futures with a downstream
consumer. If they're small, the gap is elsewhere (per-task overhead
in resolve_package's inflight gates).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/ruborist/src/resolver/preload.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/crates/ruborist/src/resolver/preload.rs b/crates/ruborist/src/resolver/preload.rs
index 1230c5bf6..e9a777407 100644
--- a/crates/ruborist/src/resolver/preload.rs
+++ b/crates/ruborist/src/resolver/preload.rs
@@ -99,8 +99,17 @@ where
     let mut in_flight = 0usize;
     let mut started = false;

+    // Main-loop overhead instrumentation. Plain local `u64` accumulators
+    // (the loop is single-task, no atomics needed) so we
+    // can attribute the gap between manifest-bench's pure-HTTP wall
+    // and ruborist's preload wall: how much of the gap is bookkeeping
+    // (dedup hash, extract_transitive_deps, queue push, events) vs
+    // actual fetch wait?
+ let mut total_dispatch_us: u64 = 0; + let mut total_result_us: u64 = 0; + loop { // Fill up to concurrency limit + let dispatch_start = tokio::time::Instant::now(); while in_flight < concurrency { let item = loop { let Some((name, spec)) = pending.pop_front() else { @@ -134,6 +143,7 @@ where }); in_flight += 1; } + total_dispatch_us += dispatch_start.elapsed().as_micros() as u64; if in_flight == 0 { break; @@ -142,6 +152,7 @@ where let Some((name, result, elapsed_ms)) = futures.next().await else { break; }; + let result_start = tokio::time::Instant::now(); in_flight -= 1; if stats.success_count == 0 && stats.failed_count == 0 { @@ -174,8 +185,15 @@ where tracing::debug!("Failed to preload {}: {}", name, e); } } + total_result_us += result_start.elapsed().as_micros() as u64; } + tracing::info!( + "p1-breakdown preload_loop_dispatch_us={} preload_loop_result_us={}", + total_dispatch_us, + total_result_us, + ); + stats.total_processed = processed.len(); receiver.on_event(BuildEvent::PreloadComplete { From 8114bf42af0e9d102bd9c2893acd764d9e0470be Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:25:54 +0800 Subject: [PATCH 09/32] perf(pm): grow rayon pool to max(num_cpus, 8) to drain p1 extract queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 main-loop instrumentation showed the preload main loop itself is fast (15-25ms total dispatch+result). The 0.8s gap from manifest-bench's 2.10s wall lives INSIDE the spawned fetch tasks. Per-fetch wall (warm runs): measured: avg_request 30ms + avg_body 6ms + avg_parse 2.5ms = ~38ms derived: preload_wall 2.4s × eff_parallel(43) / 2730 = 38ms delta: ~12ms unaccounted per task That 12ms is `extract_core_version_off_runtime` queueing on rayon's 2-thread pool. extract is called per (name, spec) — for ant-design that's ~3000+ calls. 
With pool=2 and 64 concurrent fetches each dispatching extract, the
queue depth grows; each task waits its turn before extract returns.

Bump rayon pool to `max(num_cpus, 8)` on all platforms (Windows
additionally keeps its 8MB stack-size override). Sizing the
pool above the CPU count for short blocking JSON ops (parse +
extract) replaces FIFO queueing with parallel dispatch. Real CPU
contention is bounded by num_cpus (the kernel scheduler still gates),
so the extra pool threads just hold ready-to-run dispatches in
parallel rather than serialised in a queue.

Why not just spawn_blocking (round 3 attempt): tokio's blocking pool
defaults to 512 threads, but its per-dispatch overhead was higher
than rayon's even when queueing — round 3 regressed by 0.5s.

Expected: extract queue wait drops from ~12ms to ~1-2ms wall, p1
preload_wall narrows toward manifest-bench's 2.10s.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/pm/src/util/sysconf.rs | 45 ++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/crates/pm/src/util/sysconf.rs b/crates/pm/src/util/sysconf.rs
index af77a7745..645b7b451 100644
--- a/crates/pm/src/util/sysconf.rs
+++ b/crates/pm/src/util/sysconf.rs
@@ -6,13 +6,46 @@ pub fn init() {
         reset_sigpipe();
     }

-    // Windows default thread stack is 1MB, insufficient for libdeflater + tar
-    // + rayon work-stealing.
+    init_rayon_pool();
+}
+
+/// Configure the global rayon pool size.
+///
+/// Rayon defaults to `num_cpus` workers, which is 2 on GHA ubuntu-latest.
+/// Two workers are enough for the install-path's `par_chunks(64)` extract
+/// (mostly disk-bound), but the resolve-path's manifest parse + extract
+/// pipeline runs *many* short CPU bursts (parse: ~5ms, get_core_version:
+/// ~1-3ms) dispatched from up to 64 concurrent fetches.
+///
+/// With pool=2, each fetch waits up to ~25ms in queue per dispatch —
+/// fetch-breakdown instrumentation showed avg_parse jumping 5ms (CPU)
+/// → 30ms (CPU + queue) just from the first dispatch.
The second hop
+/// (`extract_core_version_off_runtime`) has the same problem. `tokio
+/// spawn_blocking` avoids the queue but its per-dispatch overhead
+/// (round 3 measurement) was higher than rayon's queue wait at 64×.
+///
+/// Sizing the pool above the host CPU count for these short, blocking
+/// JSON-shape operations gives the queue a chance to drain even when
+/// 64 fetches dispatch concurrently. The work itself is bounded — at
+/// most 2 are doing real CPU at once on a 2-core box; the extra pool
+/// slots just hold pending tasks until a CPU is free, replacing FIFO
+/// queueing with parallel dispatch.
+///
+/// Note `.max(8)` is a floor, not a cap: bigger machines keep their
+/// native `num_cpus` (already enough there); the floor of 8
+/// oversubscribes only on hosts with fewer than 8 cores, such as
+/// the constrained 2-core CI image.
+fn init_rayon_pool() {
+    let parallelism = std::thread::available_parallelism()
+        .map(std::num::NonZero::get)
+        .unwrap_or(2);
+    let threads = parallelism.max(8);
+
+    let builder = rayon::ThreadPoolBuilder::new().num_threads(threads);
+
     #[cfg(target_os = "windows")]
-    rayon::ThreadPoolBuilder::new()
-        .stack_size(8 * 1024 * 1024)
-        .build_global()
-        .ok();
+    let builder = builder.stack_size(8 * 1024 * 1024);
+
+    builder.build_global().ok();
 }

 /// Restore default SIGPIPE handling so broken pipes cause a clean exit

From 394f6c92d7c5f929c18846abec54fefb9dbbb1bd Mon Sep 17 00:00:00 2001
From: elrrrrrrr
Date: Sat, 9 May 2026 04:06:09 +0800
Subject: [PATCH 10/32] perf(pm): skip preload for p1 path; BFS does per-level
 parallel prefetch

Adds `BuildDepsOptions::skip_preload` so callers without a pipeline
consumer (utoo deps / package-lock-only) can drop the up-front
preload phase entirely. BFS now batches prefetch per level across
the whole frontier, then runs the existing sequential
process_dependency walk against the warmed cache.

For install paths (Context::pipeline_deps_options), skip_preload
stays false so PackageResolved events still feed the download/clone
pipeline.
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 8 ++- crates/ruborist/src/resolver/builder.rs | 71 +++++++++++++++++++++--- crates/ruborist/src/service/api.rs | 21 ++++++- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index b47def019..bc4d7faa1 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -63,6 +63,7 @@ impl Context { receiver, supports_semver: get_supports_semver(), catalogs, + skip_preload: false, } } @@ -82,8 +83,13 @@ impl Context { /// Resolve dependency tree with plain ProgressReceiver. Returns /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. + /// + /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes + /// `PackageResolved` events here, so preload is pure overhead — BFS's + /// own per-level parallel prefetch warms the manifest cache. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { - let options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + options.skip_preload = true; let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 166372c91..d811fc38c 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -18,21 +18,22 @@ //! This separation allows for maximum parallelism during network I/O //! while keeping the graph building logic simple and deterministic. 
-use petgraph::graph::NodeIndex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; +use futures::stream::{self, StreamExt}; +use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_registry_dep}; -use crate::spec::{Catalogs, PackageSpec, Protocol}; +use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,9 +182,6 @@ struct NodeFlags { /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { - use crate::spec::SpecStr; - use std::collections::HashSet; - let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,20 +803,74 @@ async fn run_preload_phase( } /// Run the BFS traversal phase to build the dependency tree. +/// +/// Each level does a parallel prefetch of all unresolved registry specs +/// before the sequential `process_dependency` walk. The prefetch warms +/// the registry's manifest cache so the per-edge `process_dependency` +/// calls below hit cache instead of awaiting network. +/// +/// This collapses the previously-separate `run_preload_phase` (which +/// fetched all transitive manifests up-front) into per-level batches. 
+/// Net effect on `utoo deps`: no separate preload wall — fetch happens +/// inside BFS in waves matching the dep tree's natural levels. For +/// install paths (p0/p3), `run_preload_phase` may still run via +/// `skip_preload=false` and feed the `PackageResolved` pipeline event. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { + // Reset fetch counters so the breakdown line reports fetches issued + // *during* this BFS phase, not preload's. (Preload still runs for + // install-path callers and reports its own breakdown.) + if config.skip_preload { + crate::util::FETCH_TIMINGS.reset(); + } + let start = tokio::time::Instant::now(); + let mut total_prefetch_wall_us: u64 = 0; + let mut total_merge_wall_us: u64 = 0; let mut current_level = vec![graph.root_index]; + let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); + + // Phase A: collect unresolved registry edges across the whole level + // (deduplicated against earlier levels — once a (name, spec) is + // prefetched, the registry's cache satisfies every subsequent + // `process_dependency` call). + let mut prefetch_targets: Vec<(String, String)> = Vec::new(); + for &node_index in ¤t_level { + for edge in collect_unresolved_edges(graph, node_index) { + if edge.spec.is_registry_spec() { + let key = format!("{}@{}", edge.name, edge.spec); + if prefetched.insert(key) { + prefetch_targets.push((edge.name, edge.spec)); + } + } + } + } + + // Phase B: parallel prefetch — pure cache warming. Errors are + // ignored here; the sequential `process_dependency` below will + // re-issue (now hitting either cache or the same fresh failure) + // and propagate any real error through the existing path. 
+ if !prefetch_targets.is_empty() { + let prefetch_start = tokio::time::Instant::now(); + stream::iter(prefetch_targets) + .for_each_concurrent(config.concurrency, |(name, spec)| async move { + let _ = resolve_package(registry, &name, &spec).await; + }) + .await; + total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; + } + + let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -900,14 +952,17 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); + total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", bfs_elapsed.as_millis(), + total_prefetch_wall_us, + total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 82703ed97..5a14f2a56 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -70,6 +70,16 @@ pub struct BuildDepsOptions { /// Catalog definitions for the `catalog:` dependency protocol. /// Key `""` = default catalog, other keys = named catalogs. pub catalogs: Catalogs, + /// When true, skip the up-front `run_preload_phase`. Set by callers + /// that don't consume the `BuildEvent::PackageResolved` pipeline + /// stream — e.g. `utoo deps` (lockfile-only). The BFS phase has its + /// own per-level prefetch that warms the manifest cache, so dropping + /// preload doesn't change correctness, only avoids the redundant + /// up-front fetch + dedicated wall. 
+ /// Install paths (which feed `PipelineReceiver` to start tarball + /// downloads as resolves complete) leave this false so preload still + /// emits PackageResolved events to the pipeline. + pub skip_preload: bool, } impl BuildDepsOptions { @@ -91,6 +101,7 @@ impl BuildDepsOptions { receiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, } } } @@ -132,6 +143,7 @@ where receiver, supports_semver, catalogs, + skip_preload: skip_preload_caller, } = options; // 1. Find root path (workspace root if applicable) @@ -234,7 +246,13 @@ where registry.supports_semver(), ); - let skip_preload = cache_count > 0; + // Skip preload when: + // - the caller asked us to (e.g. `utoo deps`, no pipeline consumer + // for PackageResolved events — BFS does its own per-level + // prefetch, preload is redundant), OR + // - the project's warm cache already has manifests covering most + // of the workload (existing skip-on-warm behavior). + let skip_preload = skip_preload_caller || cache_count > 0; let mut config = BuildDepsConfig::default() .with_peer_deps(peer_deps) .with_concurrency(concurrency) @@ -334,6 +352,7 @@ mod tests { receiver: NoopReceiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, }; assert_eq!(options.concurrency, 20); From 596cd2045fd6ef5031703343b52ccad2a67a907f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:18:21 +0800 Subject: [PATCH 11/32] perf(pm): fast_preload bypasses UnifiedRegistry for utoo deps path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::fast_preload, a manifest-bench-style flat FuturesUnordered over service::manifest::fetch_full_manifest. It warms MemoryCache (both full_manifests and version_manifests slots) synchronously after each fetch, so the BFS phase is pure cache-hit: no rayon hop on extract_core_version, no OnceMap gates, no DiskManifestStore writes, no PackageResolved events. 
Wired into service::api::build_deps: when the caller asks to skip preload (Context::build_deps for `utoo deps`) and there's no warm project cache, fast_preload runs ahead of build_deps_with_config. Install paths still go through preload_manifests so the pipeline keeps its early-start signal. Also reverts the per-level prefetch I added in 394f6c92 — with fast_preload pre-warming everything, BFS doesn't need its own prefetch wave. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 72 ++---- crates/ruborist/src/resolver/fast_preload.rs | 234 +++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 26 +++ 4 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 crates/ruborist/src/resolver/fast_preload.rs diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index d811fc38c..156622502 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -24,7 +24,6 @@ use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; -use futures::stream::{self, StreamExt}; use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; @@ -32,7 +31,7 @@ use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::resolver::registry::{ResolveError, resolve_registry_dep}; use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,7 +180,10 @@ struct NodeFlags { /// Only registry specs (e.g. `^4.17.0`) are collected. 
`catalog:` specs are /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. -fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { +pub(crate) fn gather_preload_deps( + graph: &DependencyGraph, + peer_deps: PeerDeps, +) -> Vec<(String, String)> { let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,72 +807,29 @@ async fn run_preload_phase( /// Run the BFS traversal phase to build the dependency tree. /// /// Each level does a parallel prefetch of all unresolved registry specs -/// before the sequential `process_dependency` walk. The prefetch warms -/// the registry's manifest cache so the per-edge `process_dependency` -/// calls below hit cache instead of awaiting network. +/// before the sequential `process_dependency` walk. /// -/// This collapses the previously-separate `run_preload_phase` (which -/// fetched all transitive manifests up-front) into per-level batches. -/// Net effect on `utoo deps`: no separate preload wall — fetch happens -/// inside BFS in waves matching the dep tree's natural levels. For -/// install paths (p0/p3), `run_preload_phase` may still run via -/// `skip_preload=false` and feed the `PackageResolved` pipeline event. +/// When `skip_preload=true` (lockfile-only path), the caller is +/// expected to have already populated `registry.cache()` via +/// [`super::fast_preload::fast_preload`], so this BFS sees only +/// cache hits. When `skip_preload=false` (install paths), the +/// receiver-driven [`super::preload::preload_manifests`] runs ahead +/// of this phase and feeds `BuildEvent::PackageResolved` to the +/// pipeline. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { - // Reset fetch counters so the breakdown line reports fetches issued - // *during* this BFS phase, not preload's. 
(Preload still runs for - // install-path callers and reports its own breakdown.) - if config.skip_preload { - crate::util::FETCH_TIMINGS.reset(); - } - let start = tokio::time::Instant::now(); - let mut total_prefetch_wall_us: u64 = 0; - let mut total_merge_wall_us: u64 = 0; - let mut current_level = vec![graph.root_index]; - let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); - // Phase A: collect unresolved registry edges across the whole level - // (deduplicated against earlier levels — once a (name, spec) is - // prefetched, the registry's cache satisfies every subsequent - // `process_dependency` call). - let mut prefetch_targets: Vec<(String, String)> = Vec::new(); - for &node_index in ¤t_level { - for edge in collect_unresolved_edges(graph, node_index) { - if edge.spec.is_registry_spec() { - let key = format!("{}@{}", edge.name, edge.spec); - if prefetched.insert(key) { - prefetch_targets.push((edge.name, edge.spec)); - } - } - } - } - - // Phase B: parallel prefetch — pure cache warming. Errors are - // ignored here; the sequential `process_dependency` below will - // re-issue (now hitting either cache or the same fresh failure) - // and propagate any real error through the existing path. 
- if !prefetch_targets.is_empty() { - let prefetch_start = tokio::time::Instant::now(); - stream::iter(prefetch_targets) - .for_each_concurrent(config.concurrency, |(name, spec)| async move { - let _ = resolve_package(registry, &name, &spec).await; - }) - .await; - total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; - } - - let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -952,17 +911,14 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); - total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", + "p1-breakdown bfs_wall={}ms | {}", bfs_elapsed.as_millis(), - total_prefetch_wall_us, - total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs new file mode 100644 index 000000000..975c18a81 --- /dev/null +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -0,0 +1,234 @@ +//! Lean parallel manifest fetcher modeled on `manifest-bench`. +//! +//! Bypasses [`crate::service::registry::UnifiedRegistry`] — and therefore +//! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, +//! and `EventReceiver` event dispatch — to drive a flat +//! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] +//! plus a synchronous transitive walk. The warm +//! [`crate::service::cache::MemoryCache`] it leaves behind makes the +//! subsequent BFS phase a pure cache-hit walk: no network, no rayon +//! re-parse hop on `extract_core_version`. +//! +//! Intended for the lockfile-only path (`utoo deps`) which has no +//! 
pipeline consumer for `BuildEvent::PackageResolved` — install paths +//! still go through [`super::preload::preload_manifests`] so the +//! pipeline keeps its early-start signal. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::CoreVersionManifest; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +/// Statistics from the lean fetch loop. Mirrors `PreloadStats` shape so +/// the bench-grep regex stays the same. +#[derive(Debug, Default)] +pub struct FastPreloadStats { + pub success_count: usize, + pub failed_count: usize, + pub fetched_names: usize, + pub min_request_ms: u64, + pub max_request_ms: u64, + pub total_request_ms: u64, +} + +/// Collect dependencies from any deps map, filtering out non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +/// Extract transitive dependencies from a resolved manifest. +/// devDependencies are omitted (only the root installs devDeps). +fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut deps = Vec::new(); + deps.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + deps.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + deps.extend(collect_deps(manifest.optional_dependencies.as_ref())); + deps +} + +/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. 
+/// +/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does +/// after a cache hit — pick a version, parse just that subset, populate +/// the per-version cache slot the BFS phase will read from. Skips the +/// rayon/`spawn_blocking` hop because the caller is already doing +/// CPU-bound bookkeeping between fetches. +fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { + let Some(full) = cache.get_full_manifest(name) else { + return Vec::new(); + }; + let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { + return extract_transitive_deps(&cached, peer_deps); + } + let Some(core) = full.get_core_version(&resolved_version) else { + return Vec::new(); + }; + let core_arc = Arc::new(core); + cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch of all transitively-reachable +/// registry manifests. Populates `cache` with both `full_manifests` and +/// `version_manifests` slots so the subsequent BFS does no network and no +/// re-parse. +/// +/// `initial_deps` should already be the union of root+workspace +/// registry edges, with non-registry specs filtered out. +pub async fn fast_preload( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> FastPreloadStats { + let mut stats = FastPreloadStats::default(); + let mut pending: VecDeque = VecDeque::from(initial_deps); + // Specs we've already enqueued (or settled). Prevents duplicate + // sync resolutions from re-walking the same transitive subtree. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + // Names whose full manifest is either cached or in flight. 
Spec-level + // dedup happens in `seen_specs` above; this set is the gate that + // prevents two concurrent fetches for the same package (sibling + // specs queue against the in-flight one rather than racing). + let mut fetched_names: HashSet = HashSet::new(); + // Specs that arrived while their package's full manifest was still + // in flight — we'll settle them once the fetch lands. + let mut deferred_specs: Vec<(String, String)> = Vec::new(); + let mut futs = FuturesUnordered::new(); + let concurrency = config.concurrency; + let peer_deps = config.peer_deps; + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen_specs.insert((name.clone(), spec.clone())) { + continue; + } + + // Full manifest already cached: skip the network round-trip, + // settle synchronously and queue this package's transitive + // deps. This is the hot path on the second-and-later spec + // for any popular package (lodash, semver, etc.). + if cache.get_full_manifest(&name).is_some() { + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + continue; + } + + // Fetch in flight for this name — defer settling this spec + // until the fetch lands. The deferred set is small (only + // sibling specs for in-flight names) so the linear scan is + // cheaper than another HashMap. 
+ if !fetched_names.insert(name.clone()) { + deferred_specs.push((name, spec)); + continue; + } + + let registry_url = registry_url.to_string(); + let n = name.clone(); + futs.push(async move { + let start = tokio::time::Instant::now(); + let result = fetch_full_manifest(FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }) + .await; + let elapsed_ms = start.elapsed().as_millis() as u64; + (name, spec, result, elapsed_ms) + }); + } + + if futs.is_empty() { + break; + } + + let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + break; + }; + + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; + + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + cache.set_full_manifest(name.clone(), Arc::new(manifest)); + + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + + // Drain any sibling specs that arrived while this fetch + // was in flight. `extract_if`-style retain in place. + let mut i = 0; + while i < deferred_specs.len() { + if deferred_specs[i].0 == name { + let (n, s) = deferred_specs.swap_remove(i); + let new_deps = settle_spec(&n, &s, cache, peer_deps); + pending.extend(new_deps); + } else { + i += 1; + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is unreachable + // here in practice; treat it as a soft-failure to keep the + // path total. 
+ stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); + } + } + } + + let total = stats.success_count + stats.failed_count; + let avg_ms = if total > 0 { + stats.total_request_ms / total as u64 + } else { + 0 + }; + tracing::info!( + "p1-breakdown fast_preload n={} ok={} fail={} avg_req={}ms min={}ms max={}ms | {}", + total, + stats.success_count, + stats.failed_count, + avg_ms, + stats.min_request_ms, + stats.max_request_ms, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index 582e03b31..e7baad988 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -3,6 +3,7 @@ pub mod builder; pub mod common; pub mod edges; +pub mod fast_preload; #[cfg(feature = "native-git")] pub mod git; #[cfg(feature = "http-tarball")] diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 5a14f2a56..3b9b713ea 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,7 +36,10 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, + gather_preload_deps, }; +use crate::resolver::fast_preload::fast_preload; +use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; use crate::spec::Catalogs; @@ -269,6 +272,29 @@ where ); } + // Lockfile-only callers (`utoo deps`) skip the receiver-driven + // `run_preload_phase` because they have no pipeline consumer for + // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat + // `FuturesUnordered` over `fetch_full_manifest` that warms the + // `MemoryCache` so the BFS phase below is pure cache-hit. 
This is + // the manifest-bench-style path; the heavier `preload_manifests` + // path (with `OnceMap` gates + `EventReceiver` events) only runs + // for install paths that need the pipeline signal. + if skip_preload_caller && cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + fast_preload( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. From 2e74bba904e391931a71960464932334e0d46e94 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:51:27 +0800 Subject: [PATCH 12/32] perf(pm): dispatch fast_preload settle to rayon to free tokio runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 of fast_preload called settle_spec inline on the tokio worker — each settle ran simd_json::to_borrowed_value over the full manifest's raw bytes (5–10ms per spec) right on the runtime thread. CI showed it starved sibling fetches: avg_request rose +3ms, avg_parse jumped 5→11ms, p1_resolve regressed +1.0s vs the preload+BFS baseline (4.0s vs 3.0s). Fix: route every settle through extract_core_version_off_runtime (the same rayon::spawn helper the BFS path uses), and merge fetch and settle completions into a single FuturesUnordered so backpressure on either side throttles the other. Sibling specs that arrived during a fetch are now stashed by name (HashMap, not linear scan), then dispatched as their own settle futures when the fetch lands. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 248 ++++++++++++------- 1 file changed, 163 insertions(+), 85 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 975c18a81..faea79752 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a synchronous transitive walk. The warm +//! plus a rayon-dispatched per-spec settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -13,13 +13,28 @@ //! pipeline consumer for `BuildEvent::PackageResolved` — install paths //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. +//! +//! ## Why settle is dispatched off-runtime +//! +//! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a +//! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` +//! over the manifest's raw bytes. That parse is 5–10ms per spec on a +//! 100KB body. Calling it inline on the tokio runtime (the v1 of this +//! module) starves the runtime worker — sibling fetches in flight stop +//! draining their sockets while the worker is parsing, which CI showed +//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the +//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` +//! (the same path the `extract_core_version_off_runtime` helper takes) +//! keeps the runtime free to drive I/O. 
-use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use futures::future::BoxFuture; use futures::stream::{FuturesUnordered, StreamExt}; -use crate::model::manifest::CoreVersionManifest; +use crate::model::manifest::{CoreVersionManifest, FullManifest, extract_core_version_off_runtime}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; @@ -41,8 +56,32 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } +/// Output of one in-flight future. The main loop merges fetch and settle +/// completions through a single `FuturesUnordered` so backpressure on +/// either side throttles the other naturally. +/// +/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- +/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes +/// the variant large enough that clippy flags the size delta with +/// `Settled`. The cost is one heap allocation per fetched manifest; +/// trivial against the network round-trip we already paid. +#[allow(clippy::large_enum_variant)] +enum FastEvent { + Fetched { + name: String, + primary_spec: String, + result: anyhow::Result, + elapsed_ms: u64, + }, + Settled { + new_deps: Vec, + }, +} + +type FastFut = Pin + Send>>; + /// Collect dependencies from any deps map, filtering out non-registry specs. -fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -62,29 +101,41 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. +/// Resolve `(name, spec)` against `full` off the tokio runtime. 
/// -/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does -/// after a cache hit — pick a version, parse just that subset, populate -/// the per-version cache slot the BFS phase will read from. Skips the -/// rayon/`spawn_blocking` hop because the caller is already doing -/// CPU-bound bookkeeping between fetches. -fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { - let Some(full) = cache.get_full_manifest(name) else { - return Vec::new(); - }; - let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { - return Vec::new(); - }; - if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { - return extract_transitive_deps(&cached, peer_deps); - } - let Some(core) = full.get_core_version(&resolved_version) else { - return Vec::new(); - }; - let core_arc = Arc::new(core); - cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); - extract_transitive_deps(&core_arc, peer_deps) +/// Returns the freshly-extracted version manifest's transitive deps so +/// the caller can extend its pending queue. The heavy +/// `simd_json::to_borrowed_value` parse runs inside +/// `extract_core_version_off_runtime`, which dispatches to rayon — same +/// path the BFS phase uses for cold extracts. 
+fn settle_future( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> BoxFuture<'static, FastEvent> { + Box::pin(async move { + let resolved_version = match resolve_target_version((&*full).into(), &spec) { + Ok(v) => v, + Err(_) => return FastEvent::Settled { new_deps: vec![] }, + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + return FastEvent::Settled { + new_deps: extract_transitive_deps(&cached, peer_deps), + }; + } + let (resolved_version, core) = + extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; + let new_deps = match core { + Some(core_arc) => { + cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) + } + None => Vec::new(), + }; + FastEvent::Settled { new_deps } + }) } /// Manifest-bench-style flat parallel fetch of all transitively-reachable @@ -103,17 +154,15 @@ pub async fn fast_preload( let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); // Specs we've already enqueued (or settled). Prevents duplicate - // sync resolutions from re-walking the same transitive subtree. + // settles from re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - // Names whose full manifest is either cached or in flight. Spec-level - // dedup happens in `seen_specs` above; this set is the gate that - // prevents two concurrent fetches for the same package (sibling - // specs queue against the in-flight one rather than racing). + // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); - // Specs that arrived while their package's full manifest was still - // in flight — we'll settle them once the fetch lands. 
- let mut deferred_specs: Vec<(String, String)> = Vec::new(); - let mut futs = FuturesUnordered::new(); + // Sibling specs that arrived while their package's full manifest + // was still in flight. The fetch's completion handler drains this + // bucket — we stash by name so the lookup is one HashMap probe. + let mut deferred_by_name: HashMap> = HashMap::new(); + let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; let peer_deps = config.peer_deps; @@ -126,28 +175,33 @@ pub async fn fast_preload( continue; } - // Full manifest already cached: skip the network round-trip, - // settle synchronously and queue this package's transitive - // deps. This is the hot path on the second-and-later spec - // for any popular package (lodash, semver, etc.). - if cache.get_full_manifest(&name).is_some() { - let new_deps = settle_spec(&name, &spec, cache, peer_deps); - pending.extend(new_deps); + // Hot path: the full manifest is already cached (a sibling + // spec for this name has already returned). Dispatch a + // settle so the parse work runs on rayon, not on the tokio + // worker — keeps the runtime free for ongoing fetches. + if let Some(full) = cache.get_full_manifest(&name) { + futs.push(Box::pin(settle_future( + name, + spec, + full, + cache.clone(), + peer_deps, + ))); continue; } - // Fetch in flight for this name — defer settling this spec - // until the fetch lands. The deferred set is small (only - // sibling specs for in-flight names) so the linear scan is - // cheaper than another HashMap. + // A fetch for this name is already in flight: stash this + // spec; the fetch's completion handler will dispatch its + // settle. 
if !fetched_names.insert(name.clone()) { - deferred_specs.push((name, spec)); + deferred_by_name.entry(name).or_default().push(spec); continue; } let registry_url = registry_url.to_string(); + let primary_spec = spec.clone(); let n = name.clone(); - futs.push(async move { + futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); let result = fetch_full_manifest(FetchManifestOptions { registry_url: ®istry_url, @@ -157,58 +211,82 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - (name, spec, result, elapsed_ms) - }); + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } + })); } if futs.is_empty() { break; } - let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + let Some(event) = futs.next().await else { break; }; - if stats.success_count == 0 && stats.failed_count == 0 { - stats.min_request_ms = elapsed_ms; - stats.max_request_ms = elapsed_ms; - } else { - stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); - stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); - } - stats.total_request_ms += elapsed_ms; + match event { + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } => { + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - stats.success_count += 1; - stats.fetched_names += 1; - cache.set_full_manifest(name.clone(), Arc::new(manifest)); + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + let full_arc = Arc::new(manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - let new_deps = settle_spec(&name, 
&spec, cache, peer_deps); - pending.extend(new_deps); + // Primary settle. + futs.push(Box::pin(settle_future( + name.clone(), + primary_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); - // Drain any sibling specs that arrived while this fetch - // was in flight. `extract_if`-style retain in place. - let mut i = 0; - while i < deferred_specs.len() { - if deferred_specs[i].0 == name { - let (n, s) = deferred_specs.swap_remove(i); - let new_deps = settle_spec(&n, &s, cache, peer_deps); - pending.extend(new_deps); - } else { - i += 1; + // Sibling settles that were stashed while the + // fetch was in flight. + if let Some(siblings) = deferred_by_name.remove(&name) { + for sibling_spec in siblings { + futs.push(Box::pin(settle_future( + name.clone(), + sibling_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is + // unreachable in practice; treat as soft failure. + stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is unreachable - // here in practice; treat it as a soft-failure to keep the - // path total. 
- stats.failed_count += 1; - } - Err(e) => { - stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); + FastEvent::Settled { new_deps } => { + pending.extend(new_deps); } } } From 04c9ec34d26fdb97f83014c9a09e241cd64715aa Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:19:48 +0800 Subject: [PATCH 13/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=2096=20(manifest-bench=20best)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone manifest-bench HTTP-only sweep (npmjs, h1) shows wall bottoming at concurrency=96 (1817ms) — earlier 256 regression was caused by rayon-queued parses behind 2 workers, no longer relevant since fetch parse is on spawn_blocking and settle is rayon-dispatched off the runtime. fast_preload's wave-shaped transitive walk currently runs at eff_parallel ~35 against the 64 cap because pending refills lag settles; raising the cap to 96 gives headroom for sustained in-flight on the deep waves without crossing the npmjs per-IP tail-latency cliff that conc 128+ trips. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index a0235830a..f05b0f52f 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,15 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to -// rayon, which has only num_cpus (=2 on GHA) workers. Bumping -// concurrency to 256 queued 256 parses behind 2 workers → wall -// per-parse jumped from 730µs to 266ms. 
Net p1 wall *increased* -// 3.10s → 3.33s on phases bench. Keep 64 until we address the -// parse-side queueing (e.g. inline parse on tokio, or a wider -// dedicated parse pool). +// Once we moved fetch parse off rayon to tokio's spawn_blocking pool +// (cap 512) and settle off the runtime via rayon::spawn, the original +// 256-concurrency regression mechanism (parses queued behind 2 rayon +// workers) no longer applies. The standalone manifest-bench HTTP-only +// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 +// (1817ms) and tracking flat-then-rising past that — beyond ~96 +// in-flight, npmjs's per-IP rate degrades and tail latency widens. +// 96 is the sweet spot: enough headroom for the wave-shaped transitive +// dep walk in fast_preload to keep the runtime busy, without paying the +// p99 widening that 128+ shows. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 6455852e518b3cc9859e12442972f40697360d73 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:46:06 +0800 Subject: [PATCH 14/32] perf(pm): fast_preload populates (name, spec) cache slot for BFS fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedRegistry::resolve_version_manifest`'s first cache check (service/registry.rs:347) keys on `(name, spec)` — the original spec string the caller passed, e.g. `^4.0.0`. settle_future was only populating `(name, resolved_version)` (e.g. `4.17.21`), so on every BFS edge for `lodash@^4.0.0`-style specs the warm path missed and fell into the OnceMap inflight gate + `resolve_via_full_manifest` re-walk before recovering the manifest from the `(name, resolved_version)` slot we'd already set. 
Now settle writes both keys so BFS hits the early-return at service/registry.rs:347 with no further dispatch. Saves ~1 OnceMap+resolve_target_version round-trip per unique (name, spec) the BFS encounters (≈3000 calls on ant-design-x). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index faea79752..c3845a73a 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -121,6 +121,8 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + // Populate the (name, spec) slot too — see comment below. + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), }; @@ -129,6 +131,18 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { + // Populate BOTH cache slots so the subsequent BFS hits the + // fast path on its first call: + // * `(name, resolved_version)` — what + // `resolve_via_full_manifest` writes in the cold path, + // and what `extract_core_version_off_runtime`'s callers + // elsewhere expect. + // * `(name, spec)` — what `resolve_version_manifest`'s + // first cache check uses (line 347 in service/registry.rs). + // Without this slot, BFS still pays one OnceMap dispatch + // + `resolve_via_full_manifest` walk per `(name, spec)`, + // even though we've already done that work here. 
+ cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) } From 4bbcae8083de94ea69b6ef19611cdb59c719ca9c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:12:08 +0800 Subject: [PATCH 15/32] perf(pm): fuse primary settle into fetch task to drop dispatch RTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fast_preload (v2) dispatched primary settles to rayon as separate FuturesUnordered futures. CI breakdown showed eff_parallel ~44 against the conc=96 cap — the wave-shaped transitive walk was held back by settle dispatch RTT: each fetch landed → primary settle queued → settle popped → only then did `pending` get transitive deps and fill the next dispatch wave. v3 folds the primary settle into the fetch task itself via `tokio::task::spawn_blocking`. The fetch task does the network round-trip and the primary version-extract on the same blocking pool slot, then returns with the resolved CoreVersionManifest attached. Main loop pulls one Fetched event, immediately extends `pending`, no second `next().await` to wait through the queue. Sibling specs (rare; same name, different range) still go through the rayon settle_future path so the primary path stays lean. Carries primary_spec through FastEvent so the fused path can populate both `(name, primary_spec)` and `(name, resolved_version)` cache slots — preserves the 6455852e BFS fast-path win. FetchOutcome enum replaces by-value FetchManifestResult to avoid a full FullManifest clone (HashMap+Vec) per fetch event. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 206 ++++++++++++------- 1 file changed, 135 insertions(+), 71 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index c3845a73a..008030139 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a rayon-dispatched per-spec settle. The warm +//! plus a fused-into-fetch primary settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -14,18 +14,30 @@ //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. //! -//! ## Why settle is dispatched off-runtime +//! ## Why settle is fused into the fetch task //! //! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a //! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` //! over the manifest's raw bytes. That parse is 5–10ms per spec on a -//! 100KB body. Calling it inline on the tokio runtime (the v1 of this -//! module) starves the runtime worker — sibling fetches in flight stop -//! draining their sockets while the worker is parsing, which CI showed -//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the -//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` -//! (the same path the `extract_core_version_off_runtime` helper takes) -//! keeps the runtime free to drive I/O. +//! 100KB body. +//! +//! v1 ran settle inline on the tokio runtime worker — that starved +//! 
sibling fetches' I/O drive (CI showed `avg_request` +3ms, +//! `avg_parse` 5→11ms). v2 dispatched settle to rayon via a separate +//! `FuturesUnordered` future, which fixed the runtime starvation but +//! introduced a dispatch RTT: fetch lands → rayon settle queued → settle +//! pops → `pending` finally gets transitive deps. That round-trip held +//! the wave-shaped transitive walk back, capping `eff_parallel` at ~44 +//! against a 96 cap. +//! +//! v3 (this) folds the primary settle into the fetch task itself via +//! `tokio::task::spawn_blocking`. The fetch task awaits both the +//! network round-trip and the version-extract on the same blocking +//! pool slot, then returns with the resolved `CoreVersionManifest` +//! attached. The main loop pulls a single `Fetched` event and +//! immediately extends `pending` — no separate settle pop. Sibling +//! specs (rare; same package, different range) still go through a +//! `Settled` future to keep the primary path lean. use std::collections::{HashMap, HashSet, VecDeque}; use std::pin::Pin; @@ -56,21 +68,31 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } -/// Output of one in-flight future. The main loop merges fetch and settle -/// completions through a single `FuturesUnordered` so backpressure on -/// either side throttles the other naturally. -/// -/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- -/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes -/// the variant large enough that clippy flags the size delta with -/// `Settled`. The cost is one heap allocation per fetched manifest; -/// trivial against the network round-trip we already paid. -#[allow(clippy::large_enum_variant)] +/// One fetch's primary settle outcome — the resolved version + parsed +/// `CoreVersionManifest` for the spec the fetch was originally issued +/// for. `None` means the spec didn't match any version (caller treats +/// as soft skip). 
+type PrimarySettle = Option<(String, Arc)>; + +/// Outcome of a fetch task. Owning `Arc` (rather than +/// `FetchManifestResult` by-value) means the fetch task can `Arc::clone` +/// once for the primary settle, then pass ownership along — no full +/// `FullManifest` clone (which would copy the 200-entry `time` +/// HashMap + the `versions` `Vec` per fetch). +enum FetchOutcome { + Ok(Arc), + NotModified, + Err, +} + +/// Output of one in-flight future. The main loop merges fetch and +/// sibling-settle completions through a single `FuturesUnordered`. enum FastEvent { Fetched { name: String, primary_spec: String, - result: anyhow::Result, + outcome: FetchOutcome, + primary_settle: PrimarySettle, elapsed_ms: u64, }, Settled { @@ -101,13 +123,9 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against `full` off the tokio runtime. -/// -/// Returns the freshly-extracted version manifest's transitive deps so -/// the caller can extend its pending queue. The heavy -/// `simd_json::to_borrowed_value` parse runs inside -/// `extract_core_version_off_runtime`, which dispatches to rayon — same -/// path the BFS phase uses for cold extracts. +/// Off-runtime settle for a `(name, spec)` whose `FullManifest` is +/// already cached. Used for sibling specs — multiple ranges on the +/// same package — that arrive after the primary fetch has landed. fn settle_future( name: String, spec: String, @@ -121,7 +139,6 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { - // Populate the (name, spec) slot too — see comment below. 
cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), @@ -131,17 +148,6 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { - // Populate BOTH cache slots so the subsequent BFS hits the - // fast path on its first call: - // * `(name, resolved_version)` — what - // `resolve_via_full_manifest` writes in the cold path, - // and what `extract_core_version_off_runtime`'s callers - // elsewhere expect. - // * `(name, spec)` — what `resolve_version_manifest`'s - // first cache check uses (line 347 in service/registry.rs). - // Without this slot, BFS still pays one OnceMap dispatch - // + `resolve_via_full_manifest` walk per `(name, spec)`, - // even though we've already done that work here. cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) @@ -152,6 +158,35 @@ fn settle_future( }) } +/// Resolve `(name, spec)` against `full` on tokio's blocking pool. +/// +/// Same shape as `extract_core_version_off_runtime` (which uses rayon), +/// but stays inside the fetch task so the result lands together with +/// the network round-trip — no separate `FuturesUnordered` pop, so +/// `pending` gets the transitive deps the moment the fetch event is +/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is +/// `max(num_cpus, 8)`. With many primary settles arriving in waves, +/// the wider blocking pool absorbs the burst better than rayon would. 
+async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + }) + .await + .ok() + .flatten() + } + #[cfg(target_arch = "wasm32")] + { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + } +} + /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -167,14 +202,14 @@ pub async fn fast_preload( ) -> FastPreloadStats { let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); - // Specs we've already enqueued (or settled). Prevents duplicate - // settles from re-walking the same transitive subtree. + // Specs we've already enqueued. Prevents duplicate settles from + // re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); // Sibling specs that arrived while their package's full manifest - // was still in flight. The fetch's completion handler drains this - // bucket — we stash by name so the lookup is one HashMap probe. + // was still in flight. The fetch's completion handler dispatches + // settles for them, then drains this bucket. let mut deferred_by_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; @@ -189,10 +224,10 @@ pub async fn fast_preload( continue; } - // Hot path: the full manifest is already cached (a sibling - // spec for this name has already returned). 
Dispatch a - // settle so the parse work runs on rayon, not on the tokio - // worker — keeps the runtime free for ongoing fetches. + // Hot path: a sibling spec for this name has already + // returned, so the full manifest is cached. Settle on + // rayon (off-runtime) — keeps the primary fetch path + // (next branch) clean. if let Some(full) = cache.get_full_manifest(&name) { futs.push(Box::pin(settle_future( name, @@ -205,8 +240,8 @@ pub async fn fast_preload( } // A fetch for this name is already in flight: stash this - // spec; the fetch's completion handler will dispatch its - // settle. + // sibling spec; the fetch's completion handler will + // dispatch a settle for it. if !fetched_names.insert(name.clone()) { deferred_by_name.entry(name).or_default().push(spec); continue; @@ -225,10 +260,30 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; + // Fuse the primary settle into the same task so the + // main loop sees the resolved version + transitive + // deps in the same event — no extra `next().await` to + // wait through the FuturesUnordered queue before + // `pending` can refill. 
+ let (outcome, primary_settle) = match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + let full_arc = Arc::new(manifest); + let settle = + resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) + .await; + (FetchOutcome::Ok(full_arc), settle) + } + Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Err(e) => { + tracing::debug!("fast_preload failed for {}: {}", n, e); + (FetchOutcome::Err, None) + } + }; FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } })); @@ -246,7 +301,8 @@ pub async fn fast_preload( FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } => { if stats.success_count == 0 && stats.failed_count == 0 { @@ -258,24 +314,36 @@ pub async fn fast_preload( } stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { + match outcome { + FetchOutcome::Ok(full_arc) => { stats.success_count += 1; stats.fetched_names += 1; - let full_arc = Arc::new(manifest); cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - // Primary settle. - futs.push(Box::pin(settle_future( - name.clone(), - primary_spec, - Arc::clone(&full_arc), - cache.clone(), - peer_deps, - ))); + // Apply the primary settle (already done inside + // the fetch task via spawn_blocking) — populate + // both `(name, primary_spec)` and + // `(name, resolved_version)` cache slots so BFS + // hits the early-return at registry.rs:347 on + // its first probe, then extend `pending` with + // the spec's transitive deps. + if let Some((resolved_version, core_arc)) = primary_settle { + cache.set_version_manifest( + name.clone(), + primary_spec, + Arc::clone(&core_arc), + ); + cache.set_version_manifest( + name.clone(), + resolved_version, + Arc::clone(&core_arc), + ); + pending.extend(extract_transitive_deps(&core_arc, peer_deps)); + } - // Sibling settles that were stashed while the - // fetch was in flight. 
+ // Sibling specs that were stashed while the + // fetch was in flight: dispatch each as a + // separate settle future. if let Some(siblings) = deferred_by_name.remove(&name) { for sibling_spec in siblings { futs.push(Box::pin(settle_future( @@ -288,14 +356,10 @@ pub async fn fast_preload( } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is - // unreachable in practice; treat as soft failure. - stats.failed_count += 1; - } - Err(e) => { + FetchOutcome::NotModified | FetchOutcome::Err => { + // 304 is unreachable in practice (no ETag sent); + // both branches treated as soft failure. stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); } } } From 671ac98e51e4a7ca4e53149c8bead24b4f144451 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:42:55 +0800 Subject: [PATCH 16/32] perf(pm): combined-parse fetch path eliminates per-fetch double simd_json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fast_preload hot path was paying TWO simd_json passes per manifest: 1. fetch_full_manifest's parse_json_off_runtime did a typed simd_json::serde::from_slice (envelope + IgnoredAny visitor on `versions` keys, ~3-5ms on a 100KB body). 2. Primary settle re-parsed the same raw bytes with simd_json::to_borrowed_value (~5-10ms) to extract one version's subtree. Both passes went through simd_json's Tape constructor — duplicated work. CI showed avg_parse 5-7ms × 2700 fetches = 14-19s of CPU sum on 2-core GHA, where the spawn_blocking pool's overlapping schedule masked some of the cost but not all. 
Adds `service::manifest::fetch_full_manifest_with_settle`: same HTTP + retry + ETag machinery as `fetch_full_manifest`, but the parse step does ONE `to_borrowed_value` and extracts: * envelope (`name`, `dist-tags`, `versions` keys) into FullManifest manually (no typed serde), and * the resolved version's subtree as a typed CoreVersionManifest (serde-deserializing that single subtree via the borrowed value). fast_preload's fetch task switches to this entry point — primary settle is now a free byproduct of the fetch parse, not a separate `to_borrowed_value` pass. Sibling specs (same name, different range) still go through the rayon settle_future path. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 68 ++---- crates/ruborist/src/service/manifest.rs | 208 +++++++++++++++++++ crates/ruborist/src/service/mod.rs | 5 +- 3 files changed, 231 insertions(+), 50 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 008030139..d049321d8 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -51,7 +51,8 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::{ - FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, }; use crate::spec::SpecStr; use crate::util::FETCH_TIMINGS; @@ -158,35 +159,6 @@ fn settle_future( }) } -/// Resolve `(name, spec)` against `full` on tokio's blocking pool. 
-/// -/// Same shape as `extract_core_version_off_runtime` (which uses rayon), -/// but stays inside the fetch task so the result lands together with -/// the network round-trip — no separate `FuturesUnordered` pop, so -/// `pending` gets the transitive deps the moment the fetch event is -/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is -/// `max(num_cpus, 8)`. With many primary settles arriving in waves, -/// the wider blocking pool absorbs the burst better than rayon would. -async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { - #[cfg(not(target_arch = "wasm32"))] - { - tokio::task::spawn_blocking(move || { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - }) - .await - .ok() - .flatten() - } - #[cfg(target_arch = "wasm32")] - { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - } -} - /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -252,28 +224,28 @@ pub async fn fast_preload( let n = name.clone(); futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); - let result = fetch_full_manifest(FetchManifestOptions { - registry_url: ®istry_url, - name: &n, - format: MetadataFormat::Abbreviated, - etag: None, - }) + // Combined fetch + envelope parse + primary settle in + // a single `to_borrowed_value` pass — replaces the old + // pattern of typed-serde envelope parse followed by a + // separate `to_borrowed_value` reparse for version + // extraction. Halves simd_json work per fetch. 
+ let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &primary_spec, + ) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - // Fuse the primary settle into the same task so the - // main loop sees the resolved version + transitive - // deps in the same event — no extra `next().await` to - // wait through the FuturesUnordered queue before - // `pending` can refill. let (outcome, primary_settle) = match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - let full_arc = Arc::new(manifest); - let settle = - resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) - .await; - (FetchOutcome::Ok(full_arc), settle) + Ok(FetchWithSettleResult::Ok(payload)) => { + let full_arc = Arc::new(payload.manifest); + (FetchOutcome::Ok(full_arc), payload.primary_settle) } - Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Ok(FetchWithSettleResult::NotModified) => (FetchOutcome::NotModified, None), Err(e) => { tracing::debug!("fast_preload failed for {}: {}", n, e); (FetchOutcome::Err, None) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 90f1db71b..38db87969 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -4,7 +4,11 @@ //! [`crate::service::fetch`] so retry policy stays uniform across registry //! manifest fetches and non-registry resolvers (git, http tarball). +use std::collections::HashMap; +use std::sync::Arc; + use anyhow::{Result, anyhow}; +use serde::Deserialize; use tokio_retry::RetryIf; use super::fetch::{ @@ -12,6 +16,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::resolver::version::resolve_target_version; use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on tokio's blocking thread pool. 
@@ -157,6 +162,209 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result, + /// `Some` when the requested spec resolves to a real version in + /// `manifest.versions`. `None` only on no-match (rare; usually a + /// spec referring to a yanked or moved version). + pub primary_settle: Option, +} + +/// `(resolved_version, parsed_subtree)` — what +/// [`fetch_full_manifest_with_settle`] hands back to callers that +/// supplied a `primary_spec`. +pub type PrimarySettleResult = (String, Arc); + +#[allow(clippy::large_enum_variant)] +pub enum FetchWithSettleResult { + Ok(FetchWithSettle), + NotModified, +} + +/// Fetch a full manifest and resolve the primary spec from the same +/// parse pass. +/// +/// Where [`fetch_full_manifest`] uses `simd_json::serde::from_slice` +/// to materialize a typed `FullManifest` (cheap envelope, deep +/// `versions` subtrees skipped via `IgnoredAny`) and leaves version +/// subtree extraction to a later `simd_json::to_borrowed_value` +/// reparse, this entry point does the borrowed-value parse once and +/// extracts: +/// * envelope fields needed by the resolver (`name`, `dist-tags`, +/// `versions` keys), +/// * the resolved-version subtree as a typed +/// [`CoreVersionManifest`]. +/// +/// Saves one full simd_json pass on the parse hot path — +/// `fast_preload` uses ~2700 of these per `utoo deps` cold run, so +/// halving the per-fetch parse work meaningfully reduces CPU on +/// 2-core CI. 
+pub async fn fetch_full_manifest_with_settle( + opts: FetchManifestOptions<'_>, + primary_spec: &str, +) -> Result { + let url = format!("{}/{}", opts.registry_url, opts.name); + let etag_owned = opts.etag.map(|s| s.to_string()); + let primary_spec_owned = primary_spec.to_string(); + let accept = match opts.format { + MetadataFormat::Abbreviated => "application/vnd.npm.install-v1+json", + MetadataFormat::Complete => "application/json", + }; + + RetryIf::spawn( + retry_strategy(), + || { + let url = url.clone(); + let etag = etag_owned.clone(); + let primary_spec = primary_spec_owned.clone(); + async move { + let mut request = get_client() + .map_err(FetchError::Permanent)? + .get(&url) + .header("Accept", accept); + if let Some(etag_value) = &etag { + request = request.header("If-None-Match", etag_value); + } + + let t_request_start = std::time::Instant::now(); + let response = request.send().await.map_err(classify_reqwest_error)?; + let request_us = t_request_start.elapsed().as_micros() as u64; + let status = response.status(); + + if status == reqwest::StatusCode::NOT_MODIFIED { + if etag.is_some() { + return Ok(FetchWithSettleResult::NotModified); + } + return Err(classify_status(status, &url)); + } + + if status.is_success() { + let new_etag = response + .headers() + .get("etag") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let t_body_start = std::time::Instant::now(); + let raw_bytes = response + .bytes() + .await + .map_err(|e| FetchError::Permanent(anyhow!("Response read error: {e}")))? 
+ .to_vec(); + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = raw_bytes.len() as u64; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes); + + let t_parse_start = std::time::Instant::now(); + let parse_result = + parse_envelope_and_settle(Arc::clone(&raw_arc), primary_spec) + .await + .map_err(FetchError::Permanent)?; + let parse_us = t_parse_start.elapsed().as_micros() as u64; + + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + + let (manifest, primary_settle) = parse_result; + Ok(FetchWithSettleResult::Ok(FetchWithSettle { + manifest, + etag: new_etag, + primary_settle, + })) + } else { + Err(classify_status(status, &url)) + } + } + }, + is_retryable, + ) + .await + .map_err(|e| match e { + FetchError::Retryable(e) | FetchError::Permanent(e) => { + anyhow!("Failed to fetch {}: {:#}", opts.name, e) + } + }) +} + +/// Off-runtime combined parse: `simd_json::to_borrowed_value` once, +/// extract envelope into [`FullManifest`] + resolve `primary_spec` +/// against the parsed `versions` keys + materialize the resolved +/// version's subtree into [`CoreVersionManifest`]. +/// +/// Constructs `FullManifest` manually rather than via typed serde so +/// the work is exactly one parse pass. Other `FullManifest` fields +/// (`description`, `time`, `maintainers`, etc.) stay at `Default` +/// values — none are read on the resolver hot path. +async fn parse_envelope_and_settle( + raw: Arc<[u8]>, + primary_spec: String, +) -> Result<(FullManifest, Option)> { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || parse_envelope_and_settle_sync(raw, &primary_spec)) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? 
+ } + #[cfg(target_arch = "wasm32")] + { + parse_envelope_and_settle_sync(raw, &primary_spec) + } +} + +fn parse_envelope_and_settle_sync( + raw: Arc<[u8]>, + primary_spec: &str, +) -> Result<(FullManifest, Option)> { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; + + let mut buf = (*raw).to_vec(); + let parsed = + simd_json::to_borrowed_value(&mut buf).map_err(|e| anyhow!("JSON parse error: {e}"))?; + + let name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_default(); + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); + + let manifest = FullManifest { + name, + dist_tags: dist_tags.clone(), + versions: versions_keys, + raw, + ..Default::default() + }; + + // Resolve spec against the just-extracted envelope. + let primary_settle = match resolve_target_version((&manifest).into(), primary_spec) { + Ok(resolved) => parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + .map(|core| (resolved, Arc::new(core))), + Err(_) => None, + }; + + Ok((manifest, primary_settle)) +} + /// Fetch full manifest without ETag / 304 support. 
/// /// Convenience wrapper around [`fetch_full_manifest`] for callers that never diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 13109e994..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -60,8 +60,9 @@ pub use cache::{ pub use fs::{Glob, NoopGlob, exists, read_to_string}; pub use http::client_builder; pub use manifest::{ - FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, MetadataFormat, - fetch_full_manifest, fetch_full_manifest_fresh, fetch_version_manifest, + FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, FetchWithSettle, + FetchWithSettleResult, MetadataFormat, fetch_full_manifest, fetch_full_manifest_fresh, + fetch_full_manifest_with_settle, fetch_version_manifest, }; pub use registry::UnifiedRegistry; pub use store::{ManifestStore, NoopStore}; From 542d7f144ec700ab5601247eff655399585fedbe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:11:45 +0800 Subject: [PATCH 17/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2096=20=E2=86=92=20128?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After 671ac98e's combined-parse fetch path eliminated the double simd_json pass, the spawn_blocking pool's contention ceiling rose enough that bumping concurrency past 96 no longer queues parses behind 2-core CPU. manifest-bench's most recent good-network sweep on GHA showed conc=128 hitting 1500ms vs conc=96 at 1566ms — small but real headroom for fast_preload's late-wave saturation now that initial waves fill faster. Risk: on slower-network runs (npmjs per-IP throttle), conc=128 widens p99. Earlier conc-sweep data was mixed — accepting that variance for the average-case improvement. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index f05b0f52f..2f389379e 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,18 +137,17 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once we moved fetch parse off rayon to tokio's spawn_blocking pool -// (cap 512) and settle off the runtime via rayon::spawn, the original -// 256-concurrency regression mechanism (parses queued behind 2 rayon -// workers) no longer applies. The standalone manifest-bench HTTP-only -// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 -// (1817ms) and tracking flat-then-rising past that — beyond ~96 -// in-flight, npmjs's per-IP rate degrades and tail latency widens. -// 96 is the sweet spot: enough headroom for the wave-shaped transitive -// dep walk in fast_preload to keep the runtime busy, without paying the -// p99 widening that 128+ shows. +// Once parse work shrank (combined `to_borrowed_value` pass replaces +// the typed-serde envelope parse + reparse), spawn_blocking pool +// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep +// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — +// in the most recent good-network run, conc=128 hit 1500ms vs +// conc=96 at 1566ms. Bumping to 128 narrows the gap between +// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 +// because pending takes ~2 wave depths to fill) and the cap, so +// the late-wave saturation has more headroom. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From c8768ac4ce8ca26a60a3313e22dba7ac625665d7 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:39:24 +0800 Subject: [PATCH 18/32] revert(pm): manifests-concurrency-limit back to 96 542d7f14's conc=128 bench landed in a slow-network run (mb best 2010ms vs 1500ms in the prior good-network run; bun also bumped to 2.14s vs 1.83s). Adjusted gap to mb best stayed flat (~700ms either way), so conc=128 didn't beat 96 across runs. Picking 96 as the conservative default: at-or-near best on every GHA run we've measured, never the worst, and leaves headroom for npmjs's per-IP throttling to absorb without compounding p99. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 2f389379e..f6924f5aa 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,17 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once parse work shrank (combined `to_borrowed_value` pass replaces -// the typed-serde envelope parse + reparse), spawn_blocking pool -// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep -// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — -// in the most recent good-network run, conc=128 hit 1500ms vs -// conc=96 at 1566ms. 
Bumping to 128 narrows the gap between -// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 -// because pending takes ~2 wave depths to fill) and the cap, so -// the late-wave saturation has more headroom. +// manifest-bench's HTTP-only sweep on GHA (npmjs, h1) bottoms out +// somewhere in the 96-128 band — which one wins varies with npmjs's +// per-IP latency on each run (good runs picked 128, slow-network +// runs flattened the curve and even regressed at 128 due to wider +// p99 from queued requests). 96 is the conservative pick: it's at +// or near best on every run we've measured, never the worst, and +// leaves headroom for npmjs to throttle without compounding queue +// time. Combined-parse fetch (671ac98e) made the spawn_blocking +// pool no longer a contention bottleneck, but didn't change the +// network-side variance — that's what caps the useful concurrency. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 3be7487d7ad772667ac125ce82955432c257f8d3 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 09:57:58 +0800 Subject: [PATCH 19/32] perf(pm): mb_resolve experimental fetch path (parallel track to fast_preload) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::mb_resolve module + service::build_deps_mb entry point as a parallel-track alternative to fast_preload, structured to match manifest-bench's main-loop shape as closely as correctness allows. Hypothesis under test: fast_preload's eff_parallel caps at ~50/96 because the FastEvent enum match + cache writes + sibling deferred bookkeeping in the main loop competes with tokio runtime workers for the 2 CPU cores on GHA, stalling socket I/O drive. 
mb_fetch pushes ALL per-fetch work into the spawned future itself (including cache writes), so the main loop is reduced to: while let Some(deps) = futs.next().await { pending.extend(deps); refill_to_cap(...); } Sibling specs (multiple ranges on same package) are NOT deferred at queue level — racing fetches for the same name both proceed. The race converges naturally: first fetch to land populates full_manifests, subsequent racers find the cache hit on entry and short-circuit to a sibling-style settle. Wastes ~5-50 network requests in real workloads but eliminates the HashMap probe + drain overhead from the hot loop. Wired in via UTOO_RESOLVE=mb env var: - Context::build_deps (utoo deps) routes through build_deps_mb - pipeline::resolve_with_pipeline (utoo install) also routes through it; pipeline workers still start but don't pipeline during fetch (mb_fetch emits no PackageResolved events) — install becomes phase-sequential, useful for resolve-phase A/B. bench script enables UTOO_RESOLVE=mb so CI measures the new path against existing baselines (utoo-next/utoo-npm/bun ignore the env var). Comment the export line to A/B back against fast_preload. Old fast_preload + UnifiedRegistry paths untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/pm-bench-phases.sh | 7 + crates/pm/src/helper/ruborist_context.rs | 12 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 243 +++++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 161 ++++++++++++++ crates/ruborist/src/service/mod.rs | 2 +- 7 files changed, 440 insertions(+), 3 deletions(-) create mode 100644 crates/ruborist/src/resolver/mb_resolve.rs diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 226ffb751..26e43388c 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,6 +22,13 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" +# Route the current `utoo` binary's resolve phase through the +# experimental `mb_resolve` flat-fetch path. Other PMs ignore this +# env var (utoo-next is built from origin/next which doesn't have +# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out +# to A/B against the default `fast_preload` path. +export UTOO_RESOLVE=mb + # Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN # by the optional "Build next branch utoo" step. Local runs without them diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index bc4d7faa1..542664f8c 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -87,10 +87,20 @@ impl Context { /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes /// `PackageResolved` events here, so preload is pure overhead — BFS's /// own per-level parallel prefetch warms the manifest cache. 
+ /// + /// Set `UTOO_RESOLVE=mb` to opt into the experimental + /// manifest-bench-style fetch path (`build_deps_mb`) for A/B + /// benchmarking against the current `fast_preload`. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let output = utoo_ruborist::service::build_deps(options).await?; + let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 719d31d13..4169ca88d 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,7 +41,22 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - let output = utoo_ruborist::service::build_deps(options).await?; + // `UTOO_RESOLVE=mb` reroutes install through the experimental + // mb-style fetch path. Pipeline workers are still started, but + // because mb_fetch doesn't emit `PackageResolved` events, the + // pipeline only fires once BFS completes (graph_to_package_lock + // emits `PackagePlaced` from BFS). Install becomes + // phase-sequential — fetch all manifests, then download + + // clone. Useful for A/B benchmarking the resolve phase in + // isolation; the pipelining advantage of the default path is + // lost. 
+ let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs new file mode 100644 index 000000000..2928638be --- /dev/null +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -0,0 +1,243 @@ +//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! +//! A parallel-track alternative to [`super::fast_preload`], structured +//! to match `manifest-bench`'s main-loop shape as closely as +//! correctness allows. The hypothesis under test: `fast_preload`'s +//! eff_parallel caps at ~50 against a 96-cap because the main loop's +//! CPU work (FastEvent enum match + cache writes + sibling-deferred +//! bookkeeping + Box::pin allocation) competes with tokio runtime +//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! +//! `mb_resolve` pushes ALL per-fetch work into the spawned future +//! itself (cache writes included) so the main loop is reduced to: +//! +//! ```ignore +//! while let Some(deps) = futs.next().await { +//! pending.extend(deps); +//! refill_to_cap(&mut futs, &mut pending, ...); +//! } +//! ``` +//! +//! Sibling specs (multiple ranges on the same package) are NOT +//! deferred at queue level — if two specs for the same name race, +//! both fetch. This wastes a small number of network requests (~5-50 +//! across a real install) but keeps the main loop's per-event cost +//! minimal (no HashMap probe / drain). The race converges: whichever +//! fetch lands first populates `full_manifests`; subsequent racers +//! 
find the cache hit on entry and short-circuit to a sibling-style +//! settle without re-fetching. +//! +//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` +//! and `utoo install` route through this when set; install loses +//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but +//! gains the lean main loop for resolve-phase A/B testing. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +#[derive(Debug, Default)] +pub struct MbFetchStats { + pub success: usize, + pub fail: usize, +} + +/// Collect dependencies from a deps map, filtering non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut out = Vec::new(); + out.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + out.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + out.extend(collect_deps(manifest.optional_dependencies.as_ref())); + out +} + +/// Settle one (name, spec) against an already-cached `FullManifest`. +/// Used for sibling specs (or racing-fetch losers) — extracts the +/// resolved version's `CoreVersionManifest` on the blocking pool, +/// populates both `(name, spec)` and `(name, resolved_version)` cache +/// slots so BFS hits the early-return fast path. 
+async fn settle_sibling( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name, spec, Arc::clone(&cached)); + return extract_transitive(&cached, peer_deps); + } + + let resolved_for_parse = resolved.clone(); + let full_for_parse = Arc::clone(&full); + let core_opt = tokio::task::spawn_blocking(move || { + full_for_parse + .get_core_version(&resolved_for_parse) + .map(Arc::new) + }) + .await + .ok() + .flatten(); + + let Some(core_arc) = core_opt else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Self-contained per-spec future. Either fetches `(name)`'s full +/// manifest from the registry (if not yet cached), or settles against +/// an already-cached one. In both cases it: +/// * writes `full_manifests` and `version_manifests` cache slots +/// for the resolved spec, +/// * returns the spec's transitive deps for the main loop to +/// enqueue. +/// +/// Racing-fetch handling: two specs for the same name dispatched +/// concurrently both enter the fetch branch (no in-flight gate). The +/// second one re-issues a network round-trip; the cost is bounded by +/// the small number of sibling specs in real workloads (<2% in +/// ant-design-x). Last writer to `cache.set_full_manifest` wins; +/// content is identical so correctness is preserved. +async fn fetch_or_settle( + name: String, + spec: String, + registry_url: String, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + // Sibling fast path: full manifest already cached. 
+ if let Some(full) = cache.get_full_manifest(&name) { + return settle_sibling(name, spec, full, cache, peer_deps).await; + } + + let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &name, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &spec, + ) + .await; + + let Ok(FetchWithSettleResult::Ok(payload)) = result else { + return Vec::new(); + }; + + let full_arc = Arc::new(payload.manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + + let Some((resolved, core_arc)) = payload.primary_settle else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch. See module docs for the +/// rationale. +pub async fn mb_fetch( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> MbFetchStats { + let mut stats = MbFetchStats::default(); + let mut pending: VecDeque = initial_deps.into(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut futs = FuturesUnordered::new(); + let cap = config.concurrency; + let peer_deps = config.peer_deps; + let registry_url = registry_url.to_string(); + + let start = tokio::time::Instant::now(); + + // Initial fill — same shape as the refill below. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + + while let Some(transitive) = futs.next().await { + if transitive.is_empty() { + // Empty result is ambiguous (no transitive deps OR fetch + // failed) — `MbFetchStats` only tracks success/fail at a + // coarse level. 
The fetch-timings counters (recorded + // inside `fetch_full_manifest_with_settle`) carry the + // detailed per-fetch metrics. + stats.fail += 1; + } else { + stats.success += 1; + } + pending.extend(transitive); + + // Refill — same body as the initial fill above. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + } + + let wall = start.elapsed(); + tracing::info!( + "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", + wall.as_millis(), + stats.success, + stats.fail, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index e7baad988..2d0a288d9 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -8,6 +8,7 @@ pub mod fast_preload; pub mod git; #[cfg(feature = "http-tarball")] pub mod http; +pub mod mb_resolve; pub mod preload; pub mod registry; pub mod runtime; diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 3b9b713ea..9687fc875 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -39,6 +39,7 @@ use crate::resolver::builder::{ gather_preload_deps, }; use crate::resolver::fast_preload::fast_preload; +use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; @@ -332,6 +333,166 @@ where }) } +/// Experimental parallel-track entry point: structurally identical to +/// [`build_deps`] but routes the manifest-fetch phase through +/// [`crate::resolver::mb_resolve::mb_fetch`] instead of +/// [`crate::resolver::fast_preload::fast_preload`]. 
+/// +/// Intended for A/B benchmarking: install + lockfile-only callers can +/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). +/// All other behavior — workspace discovery, runtime injection, BFS, +/// graph→lock serialization, project cache export — is the same as +/// `build_deps`. The `EventReceiver` still receives BFS events; it +/// does NOT receive `PreloadFetching` / `PreloadProgress` events +/// because mb_fetch is silent (matches `manifest-bench`'s zero-event +/// loop). +/// +/// **Install-path note:** `pipeline_deps_options` callers that need +/// `PackageResolved` events to drive the download/clone pipeline +/// won't pipeline under this path — mb_fetch finishes all fetches +/// before BFS starts. Use only for `utoo deps`-style workloads, or +/// accept that install becomes phase-sequential. +pub async fn build_deps_mb(options: BuildDepsOptions) -> Result +where + G: Glob + Clone, + R: EventReceiver, +{ + let BuildDepsOptions { + cwd, + registry_url, + cache_dir, + manifest_store, + warm_project_cache, + concurrency, + peer_deps, + glob, + receiver, + supports_semver, + catalogs, + skip_preload: _, + } = options; + + // Steps 1-6: structurally identical to `build_deps` — read + // package.json, inject runtime deps, build initial graph, add + // root edges, discover and add workspaces. 
+ let discovery = WorkspaceDiscovery::new(glob.clone()); + let root_path = discovery.find_root_path(&cwd).await?; + let pkg_path = root_path.join("package.json"); + let mut pkg: PackageJson = super::fs::read_json(&pkg_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; + + if let Some(engines) = &pkg.engines { + let runtime_deps = install_runtime_from_map(engines); + if !runtime_deps.is_empty() { + for (name, version) in runtime_deps { + pkg.optional_dependencies + .get_or_insert_with(HashMap::new) + .entry(name) + .or_insert(version); + } + } + } + + let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); + let root_index = graph.root_index; + let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); + add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); + + let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; + for workspace in workspaces { + let ws_pkg = workspace.package_json; + let workspace_node = + PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let workspace_index = graph.add_node(workspace_node); + let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let link_index = graph.add_node(link_node); + graph.add_physical_edge(root_index, workspace_index); + graph.add_physical_edge(root_index, link_index); + let dep_edge_id = graph.add_dependency_edge( + root_index, + workspace.name.clone(), + &ws_pkg.version, + EdgeType::Prod, + ); + graph.mark_dependency_resolved(dep_edge_id, workspace_index); + add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); + } + + // Step 7-8: cache + registry, same as `build_deps`. Warm project + // cache is honored. 
+ let package_cache = Arc::new(PackageCache::default()); + let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); + + let mut builder = UnifiedRegistry::builder() + .registry(®istry_url) + .cache(package_cache) + .store(Arc::clone(&manifest_store)); + if let Some(semver) = supports_semver { + builder = builder.supports_semver(semver); + } + let registry = builder.build(); + + // Run mb_fetch instead of fast_preload — pre-warms cache by + // walking transitive deps via flat FuturesUnordered. Skipped if + // the warm project cache already covers the workload. + if cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + mb_fetch( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + + // BFS phase reads the now-warm cache. `skip_preload=true` skips + // the receiver-driven preload — mb_fetch already ran. + let mut config = BuildDepsConfig::default() + .with_peer_deps(peer_deps) + .with_concurrency(concurrency) + .with_skip_preload(true) + .with_catalogs(catalogs); + if let Some(dir) = cache_dir { + config = config.with_cache_dir(dir); + } + + build_deps_with_config(&mut graph, ®istry, config, &receiver) + .await + .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + + let t_serialize_start = std::time::Instant::now(); + let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; + + let t_cache_export_start = std::time::Instant::now(); + let mut project_cache = ProjectCacheData::default(); + for (key, manifest) in registry.cache().export_version_manifests() { + let (name, spec) = parse_package_spec(&key); + let version = manifest.version.clone(); + let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); + pkg_cache.specs.insert(spec.to_string(), version.clone()); + 
pkg_cache.manifests.insert(version, (*manifest).clone()); + } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); + + Ok(BuildDepsOutput { + lock: PackageLock::new(&pkg.name, &pkg.version, packages), + project_cache, + }) +} + /// Pre-populate `cache` from a warm project cache. Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 5adb6bf0b..7a7cf8ca8 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 02cc12e7a23214672215a1ee1efd6317e7ce6d8c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 10:39:27 +0800 Subject: [PATCH 20/32] =?UTF-8?q?perf(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20two-phase=20pure=20HTTP=20+=20rayon=20batch=20parse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1/v2 ran parse work in spawn_blocking inside each fetch future, which competed with tokio runtime workers for the 2 GHA cores. CI showed eff_parallel capped at 47/96 vs manifest-bench standalone's 75/96 on the same box. Hypothesis: parse CPU starves socket drive. v3 separates the two phases: * PHASE 1 — `mb_style_pure_fetch` is a structural copy of `manifest-bench`'s main loop: future body does ONLY GET + body recv, refill 1-for-1 on completion. Zero per-future CPU work, so tokio runtime workers retain full CPU for socket drive. 
* PHASE 2 — bulk rayon par_iter parse: for each body, parse `FullManifest` envelope via simd_json::to_borrowed_value, resolve every queued spec for this name against the just-parsed manifest, populate cache slots, collect transitive deps. Runs off the tokio runtime entirely (spawn_blocking → rayon par_iter). Phases alternate until pending exhausted. Typical project: 3-5 iterations as the dep tree fans out wave by wave. The point of the split is the `phase1_http_wall` trace — measured in isolation from any parse work, it should match manifest-bench's standalone wall (~1.5-2.0s for 2733 names @ conc=96). If it does, the remaining gap to mb is concentrated in phase 2 work, which is inherent to discovering transitive deps from a non-flat name list. Tracing per iteration: p1-breakdown mb_fetch iter=N phase1_http_wall=Xms n=Y bytes=Z p1-breakdown mb_fetch iter=N phase2_parse_wall=Xms settles=Y new_transitives=Z p1-breakdown mb_fetch total_wall=Xms iters=Y Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 494 ++++++++++++++------- 1 file changed, 332 insertions(+), 162 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 2928638be..05e1bf038 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,61 +1,87 @@ -//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors +//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + +//! settle. //! -//! A parallel-track alternative to [`super::fast_preload`], structured -//! to match `manifest-bench`'s main-loop shape as closely as -//! correctness allows. The hypothesis under test: `fast_preload`'s -//! eff_parallel caps at ~50 against a 96-cap because the main loop's -//! CPU work (FastEvent enum match + cache writes + sibling-deferred -//! 
bookkeeping + Box::pin allocation) competes with tokio runtime -//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! ## Phase split //! -//! `mb_resolve` pushes ALL per-fetch work into the spawned future -//! itself (cache writes included) so the main loop is reduced to: +//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` +//! ran in `spawn_blocking` threads that competed with tokio runtime +//! workers for CPU on the 2-core GHA box. When 50+ parses ran in +//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` +//! capped at ~47 against the 96 cap (vs `manifest-bench` standalone's +//! 75 on the same box). //! -//! ```ignore -//! while let Some(deps) = futs.next().await { -//! pending.extend(deps); -//! refill_to_cap(&mut futs, &mut pending, ...); -//! } -//! ``` +//! v3 separates the work: //! -//! Sibling specs (multiple ranges on the same package) are NOT -//! deferred at queue level — if two specs for the same name race, -//! both fetch. This wastes a small number of network requests (~5-50 -//! across a real install) but keeps the main loop's per-event cost -//! minimal (no HashMap probe / drain). The race converges: whichever -//! fetch lands first populates `full_manifests`; subsequent racers -//! find the cache hit on entry and short-circuit to a sibling-style -//! settle without re-fetching. +//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of +//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, +//! nothing else) + 1-for-1 refill on completion. The future body +//! has zero CPU work, so the tokio runtime workers retain full CPU +//! to drive sockets and `eff_parallel` reaches the same level as +//! the standalone bench. //! -//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` -//! and `utoo install` route through this when set; install loses -//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but -//! 
gains the lean main loop for resolve-phase A/B testing. +//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For +//! each fetched body: parse `FullManifest` envelope, resolve every +//! spec we need for this name, materialize `CoreVersionManifest` +//! subtrees, populate cache slots, collect transitive deps for the +//! next iteration. +//! +//! Phases alternate until `pending` is empty (typical project: 3-5 +//! iterations as transitive deps fan out wave by wave). +//! +//! Phase 1 is the line we measure against `manifest-bench` — +//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us +//! check eff_parallel directly. +//! +//! Wired in via `UTOO_RESOLVE=mb` env var (see +//! `pm::helper::ruborist_context::Context::build_deps`). -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use bytes::Bytes; use futures::stream::{FuturesUnordered, StreamExt}; +use rayon::prelude::*; +use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; -use crate::service::{ - FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, - fetch_full_manifest_with_settle, -}; +use crate::service::MemoryCache; +use crate::service::http::get_client; use crate::spec::SpecStr; -use crate::util::FETCH_TIMINGS; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, + pub iterations: usize, +} + +/// Phase 1 result: one body per fetched name. `bytes` is `None` on +/// transport / non-2xx — kept in the result vector so phase 2 can +/// account for it, but contributes no settle work. +struct FetchOutcome { + name: String, + bytes: Option, } -/// Collect dependencies from a deps map, filtering non-registry specs. 
-fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +/// Phase 2 per-name output. `full` is `None` on parse failure. +struct ParseOutcome { + name: String, + full: Option>, + /// Per-spec settled subtrees: `(spec, resolved_version, core)`. + /// Empty when the body failed to fetch / parse, or when no spec + /// resolves against the manifest. + settled: Vec<(String, String, Arc)>, + /// Transitive deps collected across all settled subtrees for this + /// name. Already filtered to registry specs; the main loop dedups + /// against `done_names` before queueing. + transitives: Vec, +} + +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -73,99 +99,177 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Settle one (name, spec) against an already-cached `FullManifest`. -/// Used for sibling specs (or racing-fetch losers) — extracts the -/// resolved version's `CoreVersionManifest` on the blocking pool, -/// populates both `(name, spec)` and `(name, resolved_version)` cache -/// slots so BFS hits the early-return fast path. -async fn settle_sibling( - name: String, - spec: String, - full: Arc, - cache: MemoryCache, - peer_deps: PeerDeps, -) -> Vec { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - return Vec::new(); +/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future +/// body does ONLY GET + body recv; no parse, no cache writes, no +/// dedup. Returns one `FetchOutcome` per input name in arrival order. 
+async fn mb_style_pure_fetch(
+    names: Vec<String>,
+    registry_url: &str,
+    concurrency: usize,
+) -> Vec<FetchOutcome> {
+    let client = match get_client() {
+        Ok(c) => c.clone(),
+        Err(e) => {
+            tracing::warn!("get_client failed: {e}");
+            return Vec::new();
+        }
     };
-    if let Some(cached) = cache.get_version_manifest(&name, &resolved) {
-        cache.set_version_manifest(name, spec, Arc::clone(&cached));
-        return extract_transitive(&cached, peer_deps);
-    }
-    let resolved_for_parse = resolved.clone();
-    let full_for_parse = Arc::clone(&full);
-    let core_opt = tokio::task::spawn_blocking(move || {
-        full_for_parse
-            .get_core_version(&resolved_for_parse)
-            .map(Arc::new)
-    })
-    .await
-    .ok()
-    .flatten();
+    let mut results: Vec<FetchOutcome> = Vec::with_capacity(names.len());
+    let mut futs = FuturesUnordered::new();
+    let mut idx = 0usize;
-    let Some(core_arc) = core_opt else {
-        return Vec::new();
+    let spawn_one = |client: &reqwest::Client,
+                     registry_url: &str,
+                     name: String,
+                     futs: &mut FuturesUnordered<_>| {
+        let url = format!("{}/{}", registry_url, name);
+        let client = client.clone();
+        futs.push(Box::pin(async move {
+            let bytes = match client
+                .get(&url)
+                .header("accept", "application/vnd.npm.install-v1+json")
+                .send()
+                .await
+            {
+                Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(),
+                _ => None,
+            };
+            FetchOutcome { name, bytes }
+        }));
     };
-    cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc));
-    cache.set_version_manifest(name, resolved, Arc::clone(&core_arc));
-    extract_transitive(&core_arc, peer_deps)
+
+    while idx < names.len() && futs.len() < concurrency {
+        spawn_one(&client, registry_url, names[idx].clone(), &mut futs);
+        idx += 1;
+    }
+
+    while let Some(outcome) = futs.next().await {
+        results.push(outcome);
+        if idx < names.len() {
+            spawn_one(&client, registry_url, names[idx].clone(), &mut futs);
+            idx += 1;
+        }
+    }
+
+    results
 }
Either fetches `(name)`'s full -/// manifest from the registry (if not yet cached), or settles against -/// an already-cached one. In both cases it: -/// * writes `full_manifests` and `version_manifests` cache slots -/// for the resolved spec, -/// * returns the spec's transitive deps for the main loop to -/// enqueue. -/// -/// Racing-fetch handling: two specs for the same name dispatched -/// concurrently both enter the fetch branch (no in-flight gate). The -/// second one re-issues a network round-trip; the cost is bounded by -/// the small number of sibling specs in real workloads (<2% in -/// ant-design-x). Last writer to `cache.set_full_manifest` wins; -/// content is identical so correctness is preserved. -async fn fetch_or_settle( +/// Sync phase 2 worker: parse one body, settle all specs we need for +/// this name. Runs on rayon (called from `par_iter` in +/// `parse_settle_batch`). +fn parse_one_body( name: String, - spec: String, - registry_url: String, - cache: MemoryCache, + raw: Bytes, + specs: Vec, peer_deps: PeerDeps, -) -> Vec { - // Sibling fast path: full manifest already cached. 
- if let Some(full) = cache.get_full_manifest(&name) { - return settle_sibling(name, spec, full, cache, peer_deps).await; - } +) -> ParseOutcome { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - let result = fetch_full_manifest_with_settle( - FetchManifestOptions { - registry_url: ®istry_url, - name: &name, - format: MetadataFormat::Abbreviated, - etag: None, - }, - &spec, - ) - .await; - - let Ok(FetchWithSettleResult::Ok(payload)) = result else { - return Vec::new(); + let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); + let mut buf = raw.to_vec(); + let parsed = match simd_json::to_borrowed_value(&mut buf) { + Ok(v) => v, + Err(_) => { + return ParseOutcome { + name, + full: None, + settled: Vec::new(), + transitives: Vec::new(), + }; + } }; - let full_arc = Arc::new(payload.manifest); - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + let envelope_name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| name.clone()); + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); - let Some((resolved, core_arc)) = payload.primary_settle else { - return Vec::new(); + let full = FullManifest { + name: envelope_name, + dist_tags, + versions: versions_keys, + raw: Arc::clone(&raw_arc), + ..Default::default() }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + let full_arc = Arc::new(full); + + // For each requested spec, resolve + extract version subtree. + // Cache the per-(name, version) `CoreVersionManifest` so sibling + // specs that resolve to the same version reuse the same Arc. 
+ let mut version_cache: HashMap> = HashMap::new(); + let mut settled = Vec::with_capacity(specs.len()); + let mut transitives = Vec::new(); + + for spec in specs { + let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { + continue; + }; + let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { + Arc::clone(cached) + } else { + let Some(core) = parsed + .get("versions") + .and_then(|v| v.get(resolved_version.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + else { + continue; + }; + let arc = Arc::new(core); + version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); + arc + }; + transitives.extend(extract_transitive(&core_arc, peer_deps)); + settled.push((spec, resolved_version, core_arc)); + } + + ParseOutcome { + name, + full: Some(full_arc), + settled, + transitives, + } +} + +/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. +async fn parse_settle_batch( + bodies: Vec, + by_name: HashMap>, + peer_deps: PeerDeps, +) -> Vec { + let work: Vec<(String, Bytes, Vec)> = bodies + .into_iter() + .filter_map(|f| { + let bytes = f.bytes?; + let specs = by_name.get(&f.name).cloned().unwrap_or_default(); + Some((f.name, bytes, specs)) + }) + .collect(); + + if work.is_empty() { + return Vec::new(); + } + + tokio::task::spawn_blocking(move || { + work.into_par_iter() + .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) + .collect::>() + }) + .await + .unwrap_or_default() } -/// Manifest-bench-style flat parallel fetch. See module docs for the -/// rationale. +/// Two-phase mb-style fetch with rayon batch parse. See module docs. 
pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -173,70 +277,136 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending: VecDeque = initial_deps.into(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - let mut futs = FuturesUnordered::new(); - let cap = config.concurrency; + let mut pending_specs: Vec = initial_deps; + let mut done_names: HashSet = HashSet::new(); + let conc = config.concurrency; let peer_deps = config.peer_deps; - let registry_url = registry_url.to_string(); + let total_start = tokio::time::Instant::now(); - let start = tokio::time::Instant::now(); + while !pending_specs.is_empty() { + stats.iterations += 1; + let iter = stats.iterations; - // Initial fill — same shape as the refill below. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; - }; - if !seen.insert((name.clone(), spec.clone())) { - continue; + // Group this iteration's pending specs by name. + let mut by_name: HashMap> = HashMap::new(); + for (name, spec) in pending_specs.drain(..) { + by_name.entry(name).or_default().push(spec); } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); - } - while let Some(transitive) = futs.next().await { - if transitive.is_empty() { - // Empty result is ambiguous (no transitive deps OR fetch - // failed) — `MbFetchStats` only tracks success/fail at a - // coarse level. The fetch-timings counters (recorded - // inside `fetch_full_manifest_with_settle`) carry the - // detailed per-fetch metrics. - stats.fail += 1; - } else { - stats.success += 1; + // Names whose full manifest is already cached from a prior + // iteration: settle their siblings synchronously (cheap + // semver match + cache lookup; no parse if version_manifest + // already cached, otherwise quick simd_json subtree extract). 
+ let mut sibling_only: Vec<(String, Vec)> = Vec::new(); + let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); + for (name, specs) in &by_name { + if done_names.contains(name) { + sibling_only.push((name.clone(), specs.clone())); + } else { + to_fetch.push(name.clone()); + } } - pending.extend(transitive); - // Refill — same body as the initial fill above. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; + // Sibling settles (rare on real workloads — most names appear + // exactly once across the whole walk). + for (name, specs) in sibling_only { + let Some(full) = cache.get_full_manifest(&name) else { + continue; }; - if !seen.insert((name.clone(), spec.clone())) { + for spec in specs { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + continue; + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); + pending_specs.extend(extract_transitive(&cached, peer_deps)); + continue; + } + if let Some(core) = full.get_core_version(&resolved) { + let core_arc = Arc::new(core); + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); + pending_specs.extend(extract_transitive(&core_arc, peer_deps)); + } + } + } + + if to_fetch.is_empty() { + // Iteration drained pending entirely via sibling settles. + continue; + } + + // PHASE 1 — pure HTTP, mb-style. 
+ let p1_start = tokio::time::Instant::now(); + let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; + let p1_wall = p1_start.elapsed().as_millis(); + let total_bytes: usize = bodies + .iter() + .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) + .sum(); + tracing::info!( + "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", + iter, + p1_wall, + to_fetch.len(), + total_bytes, + ); + + // PHASE 2 — rayon batch parse + settle. + let p2_start = tokio::time::Instant::now(); + let by_name_for_parse = by_name + .iter() + .filter(|(name, _)| !done_names.contains(*name)) + .map(|(n, s)| (n.clone(), s.clone())) + .collect::>(); + let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; + let p2_wall = p2_start.elapsed().as_millis(); + + let mut new_transitives: Vec = Vec::new(); + let mut settle_count = 0usize; + let mut fail_count = 0usize; + for outcome in parsed { + done_names.insert(outcome.name.clone()); + let Some(full_arc) = outcome.full else { + fail_count += 1; continue; + }; + cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); + for (spec, resolved, core) in outcome.settled { + cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); + cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); + settle_count += 1; } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); + new_transitives.extend(outcome.transitives); } + // Names that fetched but failed parse — still mark done so we + // don't refetch them next iteration. 
+ for name in to_fetch { + done_names.insert(name); + } + + stats.success += settle_count; + stats.fail += fail_count; + + tracing::info!( + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + iter, + p2_wall, + settle_count, + fail_count, + new_transitives.len(), + ); + + pending_specs.extend(new_transitives); } - let wall = start.elapsed(); + let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", - wall.as_millis(), + "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + total_wall, + stats.iterations, stats.success, stats.fail, - FETCH_TIMINGS.snapshot().summary_line(), ); stats From 24165fb6d355d78cc606b69773fe2dc466560834 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:09:29 +0800 Subject: [PATCH 21/32] =?UTF-8?q?fix(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20restore=20spec-level=20dedup=20to=20terminate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v3 dropped the (name, spec) HashSet from v1/v2 thinking name-level dedup via done_names was sufficient. It wasn't: sibling-settle's extract_transitive can re-introduce specs we've already settled (peer/optional dep cycles trivially trigger this), so the outer while-loop never terminated. CI 25589397823 hung on `Run phase-isolated benchmark · npmjs` for ~25 min before being cancelled — the bench's first utoo p1_resolve hyperfine run got stuck in an infinite settle loop. Fix: maintain `seen_specs: HashSet<(String, String)>` across all iterations; filter both initial seed and every wave of new transitives through it before extending pending_specs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 42 ++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 05e1bf038..7ef0b5d85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -278,11 +278,20 @@ pub async fn mb_fetch( ) -> MbFetchStats { let mut stats = MbFetchStats::default(); let mut pending_specs: Vec = initial_deps; + // (name, spec) pairs we've already processed (settled or queued + // to settle). Without this, sibling-settle's transitive deps can + // re-introduce already-walked specs and the outer loop never + // terminates — peer / optional dep cycles trivially trigger this. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut done_names: HashSet = HashSet::new(); let conc = config.concurrency; let peer_deps = config.peer_deps; let total_start = tokio::time::Instant::now(); + // Filter the initial seed through `seen_specs` too — root + workspace + // edges can list the same dep multiple times across workspaces. + pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + while !pending_specs.is_empty() { stats.iterations += 1; let iter = stats.iterations; @@ -308,7 +317,8 @@ pub async fn mb_fetch( } // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). + // exactly once across the whole walk). New transitives go + // through `seen_specs` dedup before joining `pending_specs`. 
for (name, specs) in sibling_only { let Some(full) = cache.get_full_manifest(&name) else { continue; @@ -317,17 +327,22 @@ pub async fn mb_fetch( let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { continue; }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - pending_specs.extend(extract_transitive(&cached, peer_deps)); - continue; - } - if let Some(core) = full.get_core_version(&resolved) { + extract_transitive(&cached, peer_deps) + } else if let Some(core) = full.get_core_version(&resolved) { let core_arc = Arc::new(core); cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - pending_specs.extend(extract_transitive(&core_arc, peer_deps)); - } + extract_transitive(&core_arc, peer_deps) + } else { + Vec::new() + }; + pending_specs.extend( + new_deps + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), + ); } } @@ -388,16 +403,21 @@ pub async fn mb_fetch( stats.success += settle_count; stats.fail += fail_count; + let new_unique: Vec = new_transitives + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) + .collect(); + tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", iter, p2_wall, settle_count, fail_count, - new_transitives.len(), + new_unique.len(), ); - pending_specs.extend(new_transitives); + pending_specs.extend(new_unique); } let total_wall = total_start.elapsed().as_millis(); From 41822b081c713758fdbd633513d7257258f39d45 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:51:06 +0800 Subject: [PATCH 22/32] 
=?UTF-8?q?perf(pm):=20preload-bench=20=E2=80=94=20s?= =?UTF-8?q?elf-contained=20streaming=20preload=20baseline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New crate `crates/preload-bench/` is a fully-standalone bench that: * Uses the SAME HTTP setup as `manifest-bench` (own reqwest::Client built per rep with aws-lc-rs TLS, pool_max_idle_per_host(256), no proxy, default DNS, no retry, h1_only). * Discovers names by walking transitive deps from a package.json root — instead of consuming a flat name list like manifest-bench. * Per-future does GET + body recv + spawn_blocking parse → returns transitive deps → main loop refills on completion. * No dependency on ruborist or any utoo internals (own simd_json, own dedup, own everything). The point: prove (or disprove) that a fully ruborist-independent streaming preload can hit standalone manifest-bench's wall on the same workload. ruborist's path runs at ~2.18s for ant-design's ~2700 names; manifest-bench standalone runs the same workload at ~1.6s. The gap could be in any number of things — DNS layer, retry, pool config, parse-CPU contention, registry single-flight gates. preload-bench eliminates all of those simultaneously so we can read the wall directly. Wired into bench-phases-linux: builds + uploads preload-bench binary alongside manifest-bench, then runs a conc=64/96/128 sweep against the same project after the standalone manifest-bench sweep. bench script reverts UTOO_RESOLVE=mb so utoo runs default fast_preload — gives a third datapoint (utoo wall on integrated path) alongside manifest-bench (HTTP-only ceiling) and preload-bench (streaming-with-walk ceiling). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 69 ++++ Cargo.toml | 1 + bench/pm-bench-phases.sh | 12 +- crates/preload-bench/Cargo.toml | 38 +++ crates/preload-bench/src/main.rs | 505 +++++++++++++++++++++++++++++ 5 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 crates/preload-bench/Cargo.toml create mode 100644 crates/preload-bench/src/main.rs diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index b25f5c380..eb560969b 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -161,6 +161,25 @@ jobs: name: manifest-bench-linux-x64 path: target/x86_64-unknown-linux-gnu/release/manifest-bench retention-days: 1 + # preload-bench: same HTTP setup as manifest-bench, but discovers + # names by walking transitive deps from a package.json root — + # tests whether a fully self-contained streaming preload can match + # standalone manifest-bench's wall on the same workload that + # ruborist's path runs at ~2.18s. 
+ - name: Build preload-bench + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p preload-bench + - name: Upload preload-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: preload-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/preload-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -547,6 +566,20 @@ jobs: chmod +x /tmp/manifest-bench-dist/manifest-bench mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + # Self-contained streaming preload bench — same HTTP setup as + # manifest-bench but discovers names via transitive walk from a + # package.json. Used to test whether a fully-isolated path can + # match standalone manifest-bench's wall on the same workload. 
+ - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -645,6 +678,42 @@ jobs: "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true } 2>&1 | tee "$MB_LOG" + # Self-contained streaming preload (transitive walk from + # package.json) — same HTTP setup as manifest-bench but with a + # streaming FuturesUnordered + per-future parse. This tests + # whether a fully ruborist-independent path can hit standalone + # manifest-bench's wall under the same project workload. + - name: Standalone preload-bench (transitive walk sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + echo "no project dir; skipping preload-bench"; exit 0 + fi + PJ="$PROJECT_DIR/package.json" + if [ ! -f "$PJ" ]; then + echo "no package.json; skipping preload-bench"; exit 0 + fi + + PB_LOG=/tmp/pm-bench-output/preload-bench-npmjs.log + { + echo "============================================================" + echo "preload-bench: streaming transitive-walk preload" + echo " Self-contained (no ruborist deps). Same HTTP setup as" + echo " manifest-bench, but discovers names by walking transitive" + echo " deps from package.json instead of consuming a flat list." 
+ echo "============================================================" + for CAP in 64 96 128; do + echo + echo "--- concurrency=$CAP, h1, transitive walk ---" + "$PRELOAD_BENCH_BIN" --package-json "$PJ" --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 4 || true + done + } 2>&1 | tee "$PB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 diff --git a/Cargo.toml b/Cargo.toml index 0574a185a..4b2836c06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "crates/manifest-bench", + "crates/preload-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 26e43388c..b025ebc6f 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,12 +22,12 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" -# Route the current `utoo` binary's resolve phase through the -# experimental `mb_resolve` flat-fetch path. Other PMs ignore this -# env var (utoo-next is built from origin/next which doesn't have -# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out -# to A/B against the default `fast_preload` path. -export UTOO_RESOLVE=mb +# utoo path defaults to fast_preload (combined-parse) so we have a +# stable baseline to compare against. preload-bench is run as a +# separate standalone tool by the CI workflow — its wall is the +# self-contained-streaming reference, ruborist's utoo p1_resolve +# wall is the integrated path. The gap between them is what +# remains to close. 
# Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN diff --git a/crates/preload-bench/Cargo.toml b/crates/preload-bench/Cargo.toml new file mode 100644 index 000000000..9d37d7769 --- /dev/null +++ b/crates/preload-bench/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "preload-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Self-contained streaming-with-transitive-walk manifest preload bench. Reproduces manifest-bench's standalone fetch loop but discovers transitive deps from package.json instead of consuming a flat name list. No dependency on ruborist or any utoo internals." + +[[bin]] +name = "preload-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +simd-json = "0.17" +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Same TLS/DNS choices as manifest-bench so the only delta vs that bench +# is the transitive-walk loop. +reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/preload-bench/src/main.rs b/crates/preload-bench/src/main.rs new file mode 100644 index 000000000..46f917d19 --- /dev/null +++ b/crates/preload-bench/src/main.rs @@ -0,0 +1,505 @@ +//! Self-contained streaming preload bench with transitive walking. +//! +//! Same HTTP setup as `manifest-bench` (own `reqwest::Client` built +//! per rep with `aws-lc-rs` TLS, `pool_max_idle_per_host(256)`, no +//! proxy, default DNS, no retry). 
The only delta vs `manifest-bench` +//! is that this bench discovers names by walking transitive deps +//! from a `package.json` root, instead of consuming a flat name +//! list. +//! +//! Why a separate crate: ruborist's manifest-fetch path goes through +//! several service layers (custom DNS resolver, retry, cache, +//! single-flight gates, event receivers). Each layer might add +//! overhead. This bench bypasses all of them — same shape as +//! manifest-bench, just with a streaming `FuturesUnordered` that +//! refills from a pending queue extended by parsed transitive deps. +//! +//! Reports both the standalone preload wall and a per-rep eff_parallel +//! number so we can compare directly against manifest-bench's +//! `phase_wall` + `avg_conc` for the same workload. +//! +//! Output (one line per rep, matching manifest-bench shape): +//! [rep N] preload_wall=Xms n=Y bytes=Z avg_conc=N.N parse_sum=Wms 200=A 4xx=B err=C + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; +use serde::Deserialize; + +#[derive(Parser, Debug)] +#[command( + name = "preload-bench", + about = "Streaming preload bench with transitive walking (self-contained)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// Path to a `package.json` to walk from. Reads `dependencies` + + /// `devDependencies` + `optionalDependencies` as the initial seed. + #[arg(long)] + package_json: PathBuf, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 96)] + concurrency: usize, + + /// Number of times to repeat the whole walk (fresh client per rep). + #[arg(long, default_value_t = 4)] + reps: usize, + + /// Force HTTP/1.1. + #[arg(long, default_value_t = true)] + http1_only: bool, + + /// Override `User-Agent`. 
+    #[arg(long)]
+    user_agent: Option<String>,
+
+    /// Include `peerDependencies` when walking transitives. Off by
+    /// default (matches utoo's default).
+    #[arg(long)]
+    include_peer: bool,
+}
+
+#[derive(Deserialize)]
+struct PackageJson {
+    #[serde(default)]
+    dependencies: HashMap<String, String>,
+    #[serde(default, rename = "devDependencies")]
+    dev_dependencies: HashMap<String, String>,
+    #[serde(default, rename = "optionalDependencies")]
+    optional_dependencies: HashMap<String, String>,
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let raw = std::fs::read_to_string(&args.package_json)
+        .with_context(|| format!("read {:?}", args.package_json))?;
+    let pkg: PackageJson = serde_json::from_str(&raw).context("parse package.json")?;
+    let initial: Vec<(String, String)> = pkg
+        .dependencies
+        .into_iter()
+        .chain(pkg.dev_dependencies)
+        .chain(pkg.optional_dependencies)
+        .filter(|(_, spec)| is_registry_spec(spec))
+        .collect();
+
+    println!(
+        "preload-bench: registry={} concurrency={} reps={} initial={} h1_only={} ua={} include_peer={}",
+        args.registry,
+        args.concurrency,
+        args.reps,
+        initial.len(),
+        args.http1_only,
+        args.user_agent.as_deref().unwrap_or(""),
+        args.include_peer,
+    );
+
+    for rep in 1..=args.reps {
+        run_once(&args, &initial, rep).await?;
+    }
+
+    Ok(())
+}
+
+/// Quick registry-spec check (a `^...` / `~...` / `latest` / etc).
+/// Excludes `file:`, `link:`, `workspace:`, `git+`, `https://`, and
+/// `/` shorthand. Same intent as ruborist's
+/// `SpecStr::is_registry_spec` but inlined to keep this crate
+/// dependency-free.
+fn is_registry_spec(spec: &str) -> bool { + if spec.is_empty() { + return true; // bare entries default to "*" + } + let lower = spec.to_ascii_lowercase(); + if lower.starts_with("file:") + || lower.starts_with("link:") + || lower.starts_with("workspace:") + || lower.starts_with("portal:") + || lower.starts_with("git+") + || lower.starts_with("git://") + || lower.starts_with("github:") + || lower.starts_with("https://") + || lower.starts_with("http://") + { + return false; + } + // `/` shorthand — exactly one '/' and no '@' prefix on + // first segment (rules out scoped names like `@scope/pkg`). + if let Some((head, tail)) = spec.split_once('/') + && !head.starts_with('@') + && !tail.is_empty() + && !tail.contains('/') + { + return false; + } + true +} + +#[derive(Debug, Default)] +struct RepStats { + n: usize, + bytes: usize, + parse_sum_us: u128, + busy_us: u128, + sum_us: u128, + ok_200: usize, + err_4xx: usize, + err_other: usize, +} + +async fn run_once(args: &Args, initial: &[(String, String)], rep: usize) -> Result<()> { + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let concurrency = args.concurrency; + let include_peer = args.include_peer; + + let phase_start = Instant::now(); + let mut stats = RepStats::default(); + + // (name, spec) dedup — same shape as ruborist's seen_specs but + // self-contained. We dedup the *spec* level because two specs on + // the same name might resolve to different versions. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque<(String, String)> = VecDeque::new(); + for (name, spec) in initial { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name.clone(), spec.clone())); + } + } + + // Sibling-fetch dedup: when two specs for the same name are both + // pending, only one fetch is issued; subsequent specs settle from + // the cached body. Keyed by name. 
Maps name → cached parsed body + // (`Arc>`) once the first fetch lands. + let body_cache: Arc>>>> = + Arc::new(std::sync::Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + + // If the body is already cached (sibling spec for an + // already-fetched name), spawn a settle-only future. + if let Some(raw) = body_cache.lock().unwrap().get(&name).cloned() { + let n = name.clone(); + let s = spec.clone(); + let fut: Fut = Box::pin(settle_only(n, s, raw, include_peer)); + futs.push(fut); + continue; + } + + // First time seeing this name: fetch + settle. Stash any + // sibling specs that arrive while in-flight. + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + + spawn_fetch( + &client, + ®istry, + name, + spec, + Arc::clone(&body_cache), + include_peer, + &mut futs, + ); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + stats.n += 1; + stats.busy_us += out.busy_us; + stats.sum_us += out.sum_us; + stats.parse_sum_us += out.parse_us; + stats.bytes += out.bytes; + match out.status { + 200 => stats.ok_200 += 1, + 400..=499 => stats.err_4xx += 1, + _ => stats.err_other += 1, + } + + // Drain sibling specs for this name now that body is cached. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().unwrap().get(&out.name).cloned() + { + for sibling_spec in siblings { + let n = out.name.clone(); + let r = Arc::clone(&raw); + let fut: Fut = Box::pin(settle_only(n, sibling_spec, r, include_peer)); + futs.push(fut); + } + } + + // Extend pending with new transitives, dedup by (name, spec). 
+ for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } + } + + let phase_wall_ms = phase_start.elapsed().as_millis(); + let parse_sum_ms = stats.parse_sum_us / 1000; + // avg_conc = sum_request_us / busy_window_us. busy_us isn't a true + // merged-interval here (we don't track per-req start/end timestamps + // for that), so use phase_wall as the denominator — slightly + // pessimistic but consistent. + let avg_conc = if phase_wall_ms > 0 { + stats.sum_us as f64 / 1000.0 / phase_wall_ms as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] preload_wall={phase_wall_ms}ms n={} bytes={} parse_sum={parse_sum_ms}ms avg_conc={avg_conc:.1} 200={} 4xx={} err={}", + stats.n, stats.bytes, stats.ok_200, stats.err_4xx, stats.err_other, + ); + Ok(()) +} + +#[derive(Debug)] +struct FetchOutcome { + name: String, + /// `(name, spec)` transitive deps unfolded by parsing the resolved + /// version's `dependencies` / `optionalDependencies` (and + /// optionally `peerDependencies`). + transitives: Vec<(String, String)>, + /// `true` if this future fetched the body (vs settle-only on a + /// cached body); only fetchers populate `body_cache` and trigger + /// sibling drain. + fetched: bool, + /// HTTP status code (200 / 4xx / 5xx / 0 on transport error). + status: u16, + /// Body byte count (0 on error). + bytes: usize, + /// Self-reported per-future busy_us — `end - start`. Approximate. + busy_us: u128, + /// Sum of all per-future durations summed by the main loop. + sum_us: u128, + /// Parse work done inside this future (for accounting). 
+ parse_us: u128, +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_fetch( + client: &reqwest::Client, + registry: &Arc, + name: String, + spec: String, + body_cache: Arc>>>>, + include_peer: bool, + futs: &mut FuturesUnordered, +) { + let url = format!("{}/{}", registry, name); + let client = client.clone(); + let fut: Fut = Box::pin(async move { + let start = Instant::now(); + let req = client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send(); + let (raw_bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default(); + (body, status) + } + Err(_) => (Vec::new(), 0), + }; + let bytes = raw_bytes.len(); + + let (parse_us, transitives) = if status == 200 && !raw_bytes.is_empty() { + let raw_arc = Arc::new(raw_bytes); + body_cache + .lock() + .unwrap() + .insert(name.clone(), Arc::clone(&raw_arc)); + // Move the Arc> into spawn_blocking; the parser + // mutates a clone, so the cached copy is unaffected. 
+ let spec_for_parse = spec.clone(); + let parse_start = Instant::now(); + let result = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw_arc, &spec_for_parse, include_peer) + }) + .await + .ok() + .flatten() + .unwrap_or_default(); + (parse_start.elapsed().as_micros(), result) + } else { + (0, Vec::new()) + }; + + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: true, + status, + bytes, + busy_us, + sum_us: busy_us, + parse_us, + } + }); + futs.push(fut); +} + +async fn settle_only( + name: String, + spec: String, + raw: Arc>, + include_peer: bool, +) -> FetchOutcome { + let start = Instant::now(); + let parse_start = start; + let transitives = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw, &spec, include_peer).unwrap_or_default() + }) + .await + .unwrap_or_default(); + let parse_us = parse_start.elapsed().as_micros(); + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: false, + status: 200, + bytes: 0, + busy_us, + sum_us: busy_us, + parse_us, + } +} + +/// Parse a manifest body, resolve `spec` against the version list, +/// extract that version's transitive deps. Single +/// `simd_json::to_borrowed_value` pass for the whole body — same as +/// ruborist's combined-parse path, but inlined here so this crate +/// has no ruborist dependency. +fn parse_and_extract( + raw: &Arc>, + spec: &str, + include_peer: bool, +) -> Option> { + use simd_json::prelude::{ValueAsObject, ValueObjectAccess}; + + let mut buf = (**raw).clone(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_obj = parsed.get("versions").and_then(ValueAsObject::as_object)?; + + // Resolve spec. 
Three cases: dist-tag match, exact-version key, or + // semver range (we approximate with "first version that satisfies" + // — preload-bench is a measurement tool, not a real resolver, so + // we tolerate slight selection differences vs ruborist for the + // purpose of timing the network path). + let resolved = if let Some(via_tag) = dist_tags.get(spec) { + via_tag.clone() + } else if versions_obj.contains_key(spec) { + spec.to_string() + } else if let Some(latest) = dist_tags.get("latest") + && spec_satisfied_by(spec, latest) + { + latest.clone() + } else { + // Last-resort: pick the lexicographically-largest version. Not + // semver-correct but bounded by the version set, and good + // enough for timing. + versions_obj.keys().max().map(|k| k.to_string())? + }; + + let version_obj = versions_obj.get(resolved.as_str())?; + let mut out: Vec<(String, String)> = Vec::new(); + + if let Some(deps) = version_obj.get("dependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if include_peer + && let Some(deps) = version_obj.get("peerDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if let Some(deps) = version_obj.get("optionalDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + Some(out) +} + +/// Crude semver-satisfies check: only handles `^X.Y.Z` and `~X.Y.Z` +/// against an exact target. Sufficient for "does latest satisfy spec" +/// in this measurement context — full semver is in the resolver, not +/// the bench. 
+fn spec_satisfied_by(spec: &str, target: &str) -> bool { + let s = spec.trim(); + let body = s + .strip_prefix('^') + .or_else(|| s.strip_prefix('~')) + .unwrap_or(s); + target.starts_with(body) || target == body +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Same setup as manifest-bench. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} From 01d15130d01cb6768d2fe5b4d4c577a7b4139a03 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:55:56 +0800 Subject: [PATCH 23/32] perf(pm): integrate standalone preload into ruborist for lockfile-only path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 1 of staged service-layer ablation. Rewrites mb_resolve as a fully self-contained streaming preload mirroring preload-bench's loop shape verbatim, but living inside ruborist so it can populate MemoryCache for the BFS phase. 
Bypasses every other ruborist service layer: * service::http::get_client — own reqwest::Client built per call, no global LazyLock, no shared_resolver dns layer, no connect_timeout, pool_max_idle_per_host(256). * service::manifest::fetch_full_manifest_with_settle — own GET + body.bytes() + spawn_blocking(simd_json::to_borrowed_value), no RetryIf, no FETCH_TIMINGS. * service::registry::UnifiedRegistry — no OnceMap, no ManifestStore, no EventReceiver. Only service::* touched is MemoryCache writes (DashMap inserts) so BFS has data to read from. PM is unaware: dispatch happens entirely inside service::api::build_deps when skip_preload=true and no warm cache. Removes the previous UTOO_RESOLVE=mb env-var gating from pm::helper::ruborist_context::Context::build_deps and pipeline::resolve_with_pipeline. Removes the now-unused service::api::build_deps_mb sibling entry point. Expected: utoo p1_resolve drops from ~2.67s toward preload-bench's ~2.57s (or better since ruborist fetches fewer names than preload-bench). The remaining gap to mb's ~1.99s would isolate incremental layer effects we add back next: - tokio runtime config / cooperative scheduling - reqwest::Client provider differences (TLS, DNS) - cache layer (DashMap vs DiskManifestStore reads on the cold path) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 597 ++++++++++----------- crates/ruborist/src/service/api.rs | 175 +----- crates/ruborist/src/service/mod.rs | 2 +- 5 files changed, 289 insertions(+), 524 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index 542664f8c..c8b758a6f 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -84,23 +84,17 @@ impl Context { /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. 
/// - /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes - /// `PackageResolved` events here, so preload is pure overhead — BFS's - /// own per-level parallel prefetch warms the manifest cache. - /// - /// Set `UTOO_RESOLVE=mb` to opt into the experimental - /// manifest-bench-style fetch path (`build_deps_mb`) for A/B - /// benchmarking against the current `fast_preload`. + /// Used by the lockfile-only path (`utoo deps`). With + /// `skip_preload=true`, ruborist's `service::api::build_deps` + /// internally routes through `mb_resolve::mb_fetch` — a + /// standalone manifest-bench-style preload that bypasses + /// `service::http` / `service::manifest` / `service::registry` + /// for the cold-cache lockfile-only workload. PM doesn't see + /// the dispatch. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 4169ca88d..719d31d13 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,22 +41,7 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - // `UTOO_RESOLVE=mb` reroutes install through the experimental - // mb-style fetch path. 
Pipeline workers are still started, but - // because mb_fetch doesn't emit `PackageResolved` events, the - // pipeline only fires once BFS completes (graph_to_package_lock - // emits `PackagePlaced` from BFS). Install becomes - // phase-sequential — fetch all manifests, then download + - // clone. Useful for A/B benchmarking the resolve phase in - // isolation; the pipelining advantage of the default path is - // lost. - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7ef0b5d85..7e1376330 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,47 +1,42 @@ -//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors -//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + -//! settle. +//! Standalone manifest preload for the lockfile-only path. //! -//! ## Phase split +//! Mirrors `crates/preload-bench`'s loop shape verbatim, but lives +//! inside ruborist so it can populate `MemoryCache` for the BFS phase +//! to read. Used by `service::api::build_deps` whenever the caller +//! has `skip_preload=true` and no warm project cache — i.e. the +//! `utoo deps` (lockfile-only) path. //! -//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` -//! ran in `spawn_blocking` threads that competed with tokio runtime -//! workers for CPU on the 2-core GHA box. When 50+ parses ran in -//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` -//! 
capped at ~47 against the 96 cap (vs `manifest-bench` standalone's -//! 75 on the same box). +//! Bypasses every other ruborist service layer: +//! * `service::http::get_client` — own `reqwest::Client` built per +//! call, no global LazyLock, no `dns_resolver(shared_resolver)`, +//! no `connect_timeout`, `pool_max_idle_per_host(256)` matching +//! `preload-bench` / `manifest-bench`. +//! * `service::manifest::fetch_full_manifest_with_settle` — own +//! `reqwest::get + body.bytes() + spawn_blocking(simd_json +//! to_borrowed_value)`, no `RetryIf`, no `FETCH_TIMINGS`. +//! * `service::registry::UnifiedRegistry` — no `OnceMap` inflight +//! gates, no `ManifestStore`, no `EventReceiver`. //! -//! v3 separates the work: +//! The only `service::*` touched is `MemoryCache::set_full_manifest` +//! and `MemoryCache::set_version_manifest` — thin DashMap wrappers +//! the BFS phase reads from. Without that, BFS would have nothing to +//! resolve against. //! -//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of -//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, -//! nothing else) + 1-for-1 refill on completion. The future body -//! has zero CPU work, so the tokio runtime workers retain full CPU -//! to drive sockets and `eff_parallel` reaches the same level as -//! the standalone bench. -//! -//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For -//! each fetched body: parse `FullManifest` envelope, resolve every -//! spec we need for this name, materialize `CoreVersionManifest` -//! subtrees, populate cache slots, collect transitive deps for the -//! next iteration. -//! -//! Phases alternate until `pending` is empty (typical project: 3-5 -//! iterations as transitive deps fan out wave by wave). -//! -//! Phase 1 is the line we measure against `manifest-bench` — -//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us -//! check eff_parallel directly. -//! -//! Wired in via `UTOO_RESOLVE=mb` env var (see -//! 
`pm::helper::ruborist_context::Context::build_deps`). - -use std::collections::{HashMap, HashSet}; +//! Why a separate path: same-run CI data shows `preload-bench` +//! (self-contained, transitive walk, 4153 fetches) lands at ~2.57s +//! while ruborist's existing `fast_preload` path (combined parse via +//! service layers, 2733 fetches) lands at ~2.67s on the same network +//! — so on a per-fetch basis the service-layer path is ~50 % slower. +//! Removing the layers should close that gap. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use std::time::Instant; -use bytes::Bytes; +use anyhow::{Context, Result}; use futures::stream::{FuturesUnordered, StreamExt}; -use rayon::prelude::*; +use parking_lot::Mutex; use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -49,38 +44,29 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; -use crate::service::http::get_client; use crate::spec::SpecStr; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, - pub iterations: usize, -} - -/// Phase 1 result: one body per fetched name. `bytes` is `None` on -/// transport / non-2xx — kept in the result vector so phase 2 can -/// account for it, but contributes no settle work. -struct FetchOutcome { - name: String, - bytes: Option, } -/// Phase 2 per-name output. `full` is `None` on parse failure. -struct ParseOutcome { - name: String, - full: Option>, - /// Per-spec settled subtrees: `(spec, resolved_version, core)`. - /// Empty when the body failed to fetch / parse, or when no spec - /// resolves against the manifest. - settled: Vec<(String, String, Arc)>, - /// Transitive deps collected across all settled subtrees for this - /// name. Already filtered to registry specs; the main loop dedups - /// against `done_names` before queueing. 
- transitives: Vec, +/// Build a fresh `reqwest::Client` matching `preload-bench` / +/// `manifest-bench` exactly, except for the TLS provider — those +/// benches use aws-lc-rs but we keep ruborist's existing default +/// rustls (ring on Linux). If A/B data shows TLS is the remaining +/// gap, we'll add the aws-lc-rs deps separately. +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .pool_max_idle_per_host(256) + .http1_only() + .build() + .context("build reqwest client for mb_resolve") } +/// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() @@ -99,177 +85,183 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future -/// body does ONLY GET + body recv; no parse, no cache writes, no -/// dedup. Returns one `FetchOutcome` per input name in arrival order. -async fn mb_style_pure_fetch( - names: Vec, - registry_url: &str, - concurrency: usize, -) -> Vec { - let client = match get_client() { - Ok(c) => c.clone(), - Err(e) => { - tracing::warn!("get_client failed: {e}"); - return Vec::new(); - } - }; - - let mut results: Vec = Vec::with_capacity(names.len()); - let mut futs = FuturesUnordered::new(); - let mut idx = 0usize; - - let spawn_one = |client: &reqwest::Client, - registry_url: &str, - name: String, - futs: &mut FuturesUnordered<_>| { - let url = format!("{}/{}", registry_url, name); - let client = client.clone(); - futs.push(Box::pin(async move { - let bytes = match client - .get(&url) - .header("accept", "application/vnd.npm.install-v1+json") - .send() - .await - { - Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), - _ => None, - }; - FetchOutcome { name, bytes } - })); - }; +/// What a future returns when it lands. 
The main loop uses +/// `transitives` to extend `pending`, plus the cache writes already +/// happened inside the future. Only `fetched=true` futures populate +/// `body_cache` and trigger sibling drain. +struct FetchOutcome { + name: String, + transitives: Vec, + fetched: bool, +} - while idx < names.len() && futs.len() < concurrency { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } +type Fut = Pin + Send>>; - while let Some(outcome) = futs.next().await { - results.push(outcome); - if idx < names.len() { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } - } +/// `(name, spec) → (FullManifest, resolved_version, version_subtree, transitive_deps)`. +type ParseResult = ( + Arc, + String, + Arc, + Vec, +); - results -} +/// Single combined parse: one `simd_json::to_borrowed_value` over the +/// raw body extracts the envelope (name, dist-tags, versions keys) +/// AND deserializes the resolved version's `CoreVersionManifest` +/// subtree. Same shape as the parse step in `preload-bench`. +fn parse_combined(raw: Arc<[u8]>, spec: &str, peer_deps: PeerDeps) -> Option { + use simd_json::prelude::{ValueAsObject, ValueAsScalar, ValueObjectAccess}; -/// Sync phase 2 worker: parse one body, settle all specs we need for -/// this name. Runs on rayon (called from `par_iter` in -/// `parse_settle_batch`). 
-fn parse_one_body( - name: String, - raw: Bytes, - specs: Vec, - peer_deps: PeerDeps, -) -> ParseOutcome { - use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - - let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); - let mut buf = raw.to_vec(); - let parsed = match simd_json::to_borrowed_value(&mut buf) { - Ok(v) => v, - Err(_) => { - return ParseOutcome { - name, - full: None, - settled: Vec::new(), - transitives: Vec::new(), - }; - } - }; + let mut buf = (*raw).to_vec(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; - let envelope_name = parsed + let name = parsed .get("name") .and_then(|v| v.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| name.clone()); + .unwrap_or_default(); let dist_tags: HashMap = parsed .get("dist-tags") .and_then(|v| HashMap::::deserialize(v).ok()) .unwrap_or_default(); let versions_keys: Vec = parsed .get("versions") - .and_then(simd_json::prelude::ValueAsObject::as_object) + .and_then(ValueAsObject::as_object) .map(|obj| obj.keys().map(|k| k.to_string()).collect()) .unwrap_or_default(); let full = FullManifest { - name: envelope_name, + name, dist_tags, versions: versions_keys, - raw: Arc::clone(&raw_arc), + raw: Arc::clone(&raw), ..Default::default() }; - let full_arc = Arc::new(full); - - // For each requested spec, resolve + extract version subtree. - // Cache the per-(name, version) `CoreVersionManifest` so sibling - // specs that resolve to the same version reuse the same Arc. 
- let mut version_cache: HashMap> = HashMap::new(); - let mut settled = Vec::with_capacity(specs.len()); - let mut transitives = Vec::new(); - - for spec in specs { - let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { - continue; + + let resolved = resolve_target_version((&full).into(), spec).ok()?; + let core = parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())?; + let core_arc = Arc::new(core); + let transitives = extract_transitive(&core_arc, peer_deps); + + Some((Arc::new(full), resolved, core_arc, transitives)) +} + +/// Fetch + combined parse + cache write for one `(name, spec)`. +/// Future body owns all per-fetch work; main loop only extends +/// `pending` from the returned transitives and refills `futs`. +fn spawn_fetch( + client: reqwest::Client, + registry_url: Arc, + name: String, + spec: String, + cache: MemoryCache, + body_cache: Arc>>>, + peer_deps: PeerDeps, +) -> Fut { + Box::pin(async move { + let url = format!("{}/{}", registry_url, name); + let resp = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(r) if r.status().is_success() => r, + _ => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } }; - let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { - Arc::clone(cached) - } else { - let Some(core) = parsed - .get("versions") - .and_then(|v| v.get(resolved_version.as_str())) - .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) - else { - continue; - }; - let arc = Arc::new(core); - version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); - arc + let raw_bytes = match resp.bytes().await { + Ok(b) => b, + Err(_) => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } + }; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); + // 
Stash in body_cache early so concurrent sibling specs + // arriving slightly after see it on their pending pop. + body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); + + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = + tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), }; - transitives.extend(extract_transitive(&core_arc, peer_deps)); - settled.push((spec, resolved_version, core_arc)); - } - ParseOutcome { - name, - full: Some(full_arc), - settled, - transitives, - } + FetchOutcome { + name, + transitives, + fetched: true, + } + }) } -/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. -async fn parse_settle_batch( - bodies: Vec, - by_name: HashMap>, +/// Settle-only future for a sibling spec whose `(name)` body already +/// landed via a sibling fetch. Same combined parse, no network. 
+fn spawn_settle( + name: String, + spec: String, + raw: Arc<[u8]>, + cache: MemoryCache, peer_deps: PeerDeps, -) -> Vec { - let work: Vec<(String, Bytes, Vec)> = bodies - .into_iter() - .filter_map(|f| { - let bytes = f.bytes?; - let specs = by_name.get(&f.name).cloned().unwrap_or_default(); - Some((f.name, bytes, specs)) +) -> Fut { + Box::pin(async move { + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(Arc::clone(&raw), &spec_for_parse, peer) }) - .collect(); - - if work.is_empty() { - return Vec::new(); - } + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + // Don't overwrite full_manifest — the original fetcher + // already set it. Only populate the version-manifest + // slots so BFS hits the (name, spec) early-return. + cache.set_full_manifest(name.clone(), full_arc); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), + }; - tokio::task::spawn_blocking(move || { - work.into_par_iter() - .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) - .collect::>() + FetchOutcome { + name, + transitives, + fetched: false, + } }) - .await - .unwrap_or_default() } -/// Two-phase mb-style fetch with rayon batch parse. See module docs. +/// Streaming preload with transitive walk. Self-contained — no +/// dependency on `service::http` / `service::manifest` / +/// `service::registry` beyond `MemoryCache` writes. pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -277,154 +269,109 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending_specs: Vec = initial_deps; - // (name, spec) pairs we've already processed (settled or queued - // to settle). 
Without this, sibling-settle's transitive deps can - // re-introduce already-walked specs and the outer loop never - // terminates — peer / optional dep cycles trivially trigger this. - let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - let mut done_names: HashSet = HashSet::new(); - let conc = config.concurrency; + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return stats; + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = config.concurrency; let peer_deps = config.peer_deps; - let total_start = tokio::time::Instant::now(); - // Filter the initial seed through `seen_specs` too — root + workspace - // edges can list the same dep multiple times across workspaces. - pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + // Spec-level dedup across the entire run. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + for (name, spec) in initial_deps { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } - while !pending_specs.is_empty() { - stats.iterations += 1; - let iter = stats.iterations; + // Sibling-fetch dedup: when two specs for the same name are both + // in flight, only the first fires a fetch; the second arrives at + // the cached body and goes through `spawn_settle` instead. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); - // Group this iteration's pending specs by name. - let mut by_name: HashMap> = HashMap::new(); - for (name, spec) in pending_specs.drain(..) 
{ - by_name.entry(name).or_default().push(spec); - } + let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Names whose full manifest is already cached from a prior - // iteration: settle their siblings synchronously (cheap - // semver match + cache lookup; no parse if version_manifest - // already cached, otherwise quick simd_json subtree extract). - let mut sibling_only: Vec<(String, Vec)> = Vec::new(); - let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); - for (name, specs) in &by_name { - if done_names.contains(name) { - sibling_only.push((name.clone(), specs.clone())); - } else { - to_fetch.push(name.clone()); + loop { + // Refill to cap. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + // Sibling fast path: body already cached. + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; } - } - - // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). New transitives go - // through `seen_specs` dedup before joining `pending_specs`. - for (name, specs) in sibling_only { - let Some(full) = cache.get_full_manifest(&name) else { + // Defer if a fetch for this name is already in flight. 
+ if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); continue; - }; - for spec in specs { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - continue; - }; - let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - extract_transitive(&cached, peer_deps) - } else if let Some(core) = full.get_core_version(&resolved) { - let core_arc = Arc::new(core); - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) - } else { - Vec::new() - }; - pending_specs.extend( - new_deps - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), - ); } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); } - if to_fetch.is_empty() { - // Iteration drained pending entirely via sibling settles. - continue; + if futs.is_empty() { + break; } - // PHASE 1 — pure HTTP, mb-style. - let p1_start = tokio::time::Instant::now(); - let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; - let p1_wall = p1_start.elapsed().as_millis(); - let total_bytes: usize = bodies - .iter() - .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) - .sum(); - tracing::info!( - "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", - iter, - p1_wall, - to_fetch.len(), - total_bytes, - ); - - // PHASE 2 — rayon batch parse + settle. 
- let p2_start = tokio::time::Instant::now(); - let by_name_for_parse = by_name - .iter() - .filter(|(name, _)| !done_names.contains(*name)) - .map(|(n, s)| (n.clone(), s.clone())) - .collect::>(); - let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; - let p2_wall = p2_start.elapsed().as_millis(); - - let mut new_transitives: Vec = Vec::new(); - let mut settle_count = 0usize; - let mut fail_count = 0usize; - for outcome in parsed { - done_names.insert(outcome.name.clone()); - let Some(full_arc) = outcome.full else { - fail_count += 1; - continue; - }; - cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); - for (spec, resolved, core) in outcome.settled { - cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); - cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); - settle_count += 1; - } - new_transitives.extend(outcome.transitives); - } - // Names that fetched but failed parse — still mark done so we - // don't refetch them next iteration. - for name in to_fetch { - done_names.insert(name); + let Some(out) = futs.next().await else { break }; + + if out.transitives.is_empty() && out.fetched { + // Empty result from a fetch is ambiguous (no transitives + // OR a fetch/parse failure). Track conservatively as + // success — the FETCH_TIMINGS-equivalent counter is + // omitted in this path on purpose to keep the future + // body lean. + stats.success += 1; + } else if out.fetched { + stats.success += 1; } - stats.success += settle_count; - stats.fail += fail_count; - - let new_unique: Vec = new_transitives - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) - .collect(); - - tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", - iter, - p2_wall, - settle_count, - fail_count, - new_unique.len(), - ); + // Drain sibling specs deferred while the fetch was in flight. 
+ if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } - pending_specs.extend(new_unique); + // Extend pending with new transitive specs, dedup. + for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } } let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + "p1-breakdown mb_fetch wall={}ms ok={} fail={}", total_wall, - stats.iterations, stats.success, stats.fail, ); diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 9687fc875..06079b248 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -38,7 +38,6 @@ use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, gather_preload_deps, }; -use crate::resolver::fast_preload::fast_preload; use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; @@ -275,19 +274,19 @@ where // Lockfile-only callers (`utoo deps`) skip the receiver-driven // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat - // `FuturesUnordered` over `fetch_full_manifest` that warms the - // `MemoryCache` so the BFS phase below is pure cache-hit. This is - // the manifest-bench-style path; the heavier `preload_manifests` - // path (with `OnceMap` gates + `EventReceiver` events) only runs - // for install paths that need the pipeline signal. + // `BuildEvent::PackageResolved`. 
Route through `mb_fetch` — a + // ruborist-internal standalone preload that bypasses + // `service::http`, `service::manifest`, and `service::registry` + // to match `manifest-bench`'s loop shape directly. PM is + // unaware: this dispatch happens entirely inside ruborist when + // `skip_preload=true` and there's no warm project cache. if skip_preload_caller && cache_count == 0 { let initial_deps = gather_preload_deps(&graph, peer_deps); let preload_config = PreloadConfig { peer_deps, concurrency, }; - fast_preload( + mb_fetch( initial_deps, registry.registry_url(), registry.cache(), @@ -333,166 +332,6 @@ where }) } -/// Experimental parallel-track entry point: structurally identical to -/// [`build_deps`] but routes the manifest-fetch phase through -/// [`crate::resolver::mb_resolve::mb_fetch`] instead of -/// [`crate::resolver::fast_preload::fast_preload`]. -/// -/// Intended for A/B benchmarking: install + lockfile-only callers can -/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). -/// All other behavior — workspace discovery, runtime injection, BFS, -/// graph→lock serialization, project cache export — is the same as -/// `build_deps`. The `EventReceiver` still receives BFS events; it -/// does NOT receive `PreloadFetching` / `PreloadProgress` events -/// because mb_fetch is silent (matches `manifest-bench`'s zero-event -/// loop). -/// -/// **Install-path note:** `pipeline_deps_options` callers that need -/// `PackageResolved` events to drive the download/clone pipeline -/// won't pipeline under this path — mb_fetch finishes all fetches -/// before BFS starts. Use only for `utoo deps`-style workloads, or -/// accept that install becomes phase-sequential. 
-pub async fn build_deps_mb(options: BuildDepsOptions) -> Result -where - G: Glob + Clone, - R: EventReceiver, -{ - let BuildDepsOptions { - cwd, - registry_url, - cache_dir, - manifest_store, - warm_project_cache, - concurrency, - peer_deps, - glob, - receiver, - supports_semver, - catalogs, - skip_preload: _, - } = options; - - // Steps 1-6: structurally identical to `build_deps` — read - // package.json, inject runtime deps, build initial graph, add - // root edges, discover and add workspaces. - let discovery = WorkspaceDiscovery::new(glob.clone()); - let root_path = discovery.find_root_path(&cwd).await?; - let pkg_path = root_path.join("package.json"); - let mut pkg: PackageJson = super::fs::read_json(&pkg_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; - - if let Some(engines) = &pkg.engines { - let runtime_deps = install_runtime_from_map(engines); - if !runtime_deps.is_empty() { - for (name, version) in runtime_deps { - pkg.optional_dependencies - .get_or_insert_with(HashMap::new) - .entry(name) - .or_insert(version); - } - } - } - - let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); - let root_index = graph.root_index; - let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); - add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); - - let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; - for workspace in workspaces { - let ws_pkg = workspace.package_json; - let workspace_node = - PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let workspace_index = graph.add_node(workspace_node); - let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let link_index = graph.add_node(link_node); - graph.add_physical_edge(root_index, workspace_index); - graph.add_physical_edge(root_index, link_index); - let dep_edge_id = graph.add_dependency_edge( - root_index, - 
workspace.name.clone(), - &ws_pkg.version, - EdgeType::Prod, - ); - graph.mark_dependency_resolved(dep_edge_id, workspace_index); - add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); - } - - // Step 7-8: cache + registry, same as `build_deps`. Warm project - // cache is honored. - let package_cache = Arc::new(PackageCache::default()); - let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); - - let mut builder = UnifiedRegistry::builder() - .registry(®istry_url) - .cache(package_cache) - .store(Arc::clone(&manifest_store)); - if let Some(semver) = supports_semver { - builder = builder.supports_semver(semver); - } - let registry = builder.build(); - - // Run mb_fetch instead of fast_preload — pre-warms cache by - // walking transitive deps via flat FuturesUnordered. Skipped if - // the warm project cache already covers the workload. - if cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); - let preload_config = PreloadConfig { - peer_deps, - concurrency, - }; - mb_fetch( - initial_deps, - registry.registry_url(), - registry.cache(), - &preload_config, - ) - .await; - } - - // BFS phase reads the now-warm cache. `skip_preload=true` skips - // the receiver-driven preload — mb_fetch already ran. 
- let mut config = BuildDepsConfig::default() - .with_peer_deps(peer_deps) - .with_concurrency(concurrency) - .with_skip_preload(true) - .with_catalogs(catalogs); - if let Some(dir) = cache_dir { - config = config.with_cache_dir(dir); - } - - build_deps_with_config(&mut graph, ®istry, config, &receiver) - .await - .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; - - let t_serialize_start = std::time::Instant::now(); - let (packages, _total) = graph.serialize_to_packages(&root_path); - let serialize_us = t_serialize_start.elapsed().as_micros() as u64; - - let t_cache_export_start = std::time::Instant::now(); - let mut project_cache = ProjectCacheData::default(); - for (key, manifest) in registry.cache().export_version_manifests() { - let (name, spec) = parse_package_spec(&key); - let version = manifest.version.clone(); - let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); - pkg_cache.specs.insert(spec.to_string(), version.clone()); - pkg_cache.manifests.insert(version, (*manifest).clone()); - } - let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; - - tracing::info!( - "p1-breakdown serialize_us={} cache_export_us={}", - serialize_us, - cache_export_us, - ); - - Ok(BuildDepsOutput { - lock: PackageLock::new(&pkg.name, &pkg.version, packages), - project_cache, - }) -} - /// Pre-populate `cache` from a warm project cache. 
Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 7a7cf8ca8..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From d9fb207026b1429a2a5c897df3077ed5e0377022 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 14:42:10 +0800 Subject: [PATCH 24/32] perf(pm): aws-lc-rs TLS for mb_resolve + per-stage breakdown for BFS/mb_fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 2 of staged service-layer ablation. Targets the two gaps left after step 1: 1. mb_fetch (in ruborist): 2300ms / 2735 = 0.84 ms/name manifest-bench (standalone): 2010ms / 2735 = 0.72 ms/name ~290ms gap on same workload, same conc. 2. BFS phase: 305ms wall against a fully-warm MemoryCache. Origin unclear — could be graph mutations, repeated cache lookups via the inflight gate, or event dispatch. Changes: * TLS provider — adds rustls (aws-lc-rs) + rustls-native-certs to non-wasm-non-macos targets. mb_resolve's `build_mb_client` now uses `use_preconfigured_tls(aws_lc_rs)` matching preload-bench / manifest-bench exactly. The reqwest crate's `rustls-tls-native-roots` feature on Linux still bundles ring for service::http's global client; the two providers coexist. * mb_fetch instrumentation — per-future `wall_us` (network + parse + cache writes) and `net_us` (network only) reported in the trace line as `eff_par_full`, `eff_par_net`, `avg_wall`, `avg_net`. 
Same shape as manifest-bench's `avg_conc` so we can compare directly. * BFS instrumentation — splits run_bfs_phase wall into: - `collect_us`: collect_unresolved_edges sum - `resolve_us`: process_dependency .await sum - `event_us`: post-resolve event dispatch (Resolved / PackagePlaced / Reused / Skipped) sum Plus `levels` and `edges` counters. Trace line lets us attribute the 305ms. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/Cargo.toml | 12 ++- crates/ruborist/src/resolver/builder.rs | 28 ++++- crates/ruborist/src/resolver/mb_resolve.rs | 113 +++++++++++++++++++-- 3 files changed, 141 insertions(+), 12 deletions(-) diff --git a/crates/ruborist/Cargo.toml b/crates/ruborist/Cargo.toml index fdda5ea5e..57d96f187 100644 --- a/crates/ruborist/Cargo.toml +++ b/crates/ruborist/Cargo.toml @@ -52,9 +52,17 @@ workspace = true [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } -# Native (non-macOS) targets: reqwest's default rustls + ring. +# Native (non-macOS) targets: reqwest's default rustls + ring (used by +# `service::http`'s global client). `mb_resolve` separately brings +# `rustls` (with aws-lc-rs) and `rustls-native-certs` to build its +# own client via `use_preconfigured_tls(aws_lc_rs)` — same TLS choice +# as `manifest-bench` / `preload-bench`. The two providers coexist: +# reqwest's internal client uses ring; `mb_resolve`'s explicit client +# uses aws-lc-rs. 
[target.'cfg(not(any(target_arch = "wasm32", target_os = "macos")))'.dependencies] -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots"] } +rustls = { version = "0.23", default-features = false, features = ["aws-lc-rs", "logging", "std", "tls12"] } +rustls-native-certs = "0.8" # Native-only dependencies (not compiled for WASM) [target.'cfg(not(target_arch = "wasm32"))'.dependencies] diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 156622502..a5d3e12a4 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -825,7 +825,18 @@ async fn run_bfs_phase( let start = tokio::time::Instant::now(); let mut current_level = vec![graph.root_index]; + // Per-stage instrumentation. The full BFS wall is `bfs_elapsed` + // below; these split it into work types so we can see whether + // graph traversal, edge resolution, or post-resolve event + // dispatch dominates. 
+ let mut total_collect_us: u64 = 0; + let mut total_resolve_us: u64 = 0; + let mut total_event_us: u64 = 0; + let mut total_edges: u64 = 0; + let mut total_levels: u64 = 0; + while !current_level.is_empty() { + total_levels += 1; receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); @@ -846,7 +857,10 @@ async fn run_bfs_phase( } // Process unresolved dependencies + let collect_start = std::time::Instant::now(); let unresolved = collect_unresolved_edges(graph, node_index); + total_collect_us += collect_start.elapsed().as_micros() as u64; + total_edges += unresolved.len() as u64; receiver.on_event(BuildEvent::DependencyCount { count: unresolved.len(), }); @@ -855,6 +869,7 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::Resolving { name: &edge_info.name, }); + let resolve_start = std::time::Instant::now(); let result = process_dependency(graph, registry, node_index, &edge_info, config) .await .map_err(|inner| { @@ -865,7 +880,10 @@ async fn run_bfs_phase( source: Box::new(inner), } }); - match result? 
{ + total_resolve_us += resolve_start.elapsed().as_micros() as u64; + let event_start = std::time::Instant::now(); + let processed = result?; + match processed { ProcessResult::Created(idx) => { // Extract node info for events if let Some(node) = graph.get_node(idx) { @@ -905,6 +923,7 @@ async fn run_bfs_phase( }); } } + total_event_us += event_start.elapsed().as_micros() as u64; } } @@ -917,8 +936,13 @@ async fn run_bfs_phase( let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms levels={} edges={} collect={}us resolve={}us event={}us | {}", bfs_elapsed.as_millis(), + total_levels, + total_edges, + total_collect_us, + total_resolve_us, + total_event_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7e1376330..a4b2ba8c1 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -34,7 +34,7 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Instant; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; use serde::Deserialize; @@ -53,12 +53,37 @@ pub struct MbFetchStats { } /// Build a fresh `reqwest::Client` matching `preload-bench` / -/// `manifest-bench` exactly, except for the TLS provider — those -/// benches use aws-lc-rs but we keep ruborist's existing default -/// rustls (ring on Linux). If A/B data shows TLS is the remaining -/// gap, we'll add the aws-lc-rs deps separately. +/// `manifest-bench` exactly: aws-lc-rs TLS provider via +/// `use_preconfigured_tls`, `pool_max_idle_per_host(256)`, no +/// proxy, `http1_only`. 
The reqwest crate's +/// `rustls-tls-native-roots` feature on Linux still bundles ring +/// for `service::http`'s global client, but this client overrides +/// at construction time — both providers coexist in the binary. +#[cfg(not(target_arch = "wasm32"))] fn build_mb_client() -> Result { + // Idempotent: first install_default wins; subsequent calls are + // no-ops. Sets the process-wide default for any rustls consumer + // that builds a `ClientConfig` without explicit provider. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + // Tolerate individual bad roots — same defensive load pattern + // as `service::http::build_rustls_config`. + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + reqwest::Client::builder() + .use_preconfigured_tls(tls_config) .no_proxy() .pool_max_idle_per_host(256) .http1_only() @@ -66,6 +91,14 @@ fn build_mb_client() -> Result { .context("build reqwest client for mb_resolve") } +#[cfg(target_arch = "wasm32")] +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .build() + .context("build reqwest client for mb_resolve") +} + /// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() @@ -93,6 +126,14 @@ struct FetchOutcome { name: String, transitives: Vec, fetched: bool, + /// Per-future wall (network + body recv + spawn_blocking parse). + /// Summed across all futures, divided by mb_fetch total wall = + /// eff_parallel — the same number `manifest-bench` reports as + /// `avg_conc`. 
Used to spot wave-shape underutilization. + wall_us: u64, + /// Per-future network-only wall (request.send + body.bytes). + /// `wall_us - net_us` is the spawn_blocking parse contribution. + net_us: u64, } type Fut = Pin + Send>>; @@ -162,6 +203,7 @@ fn spawn_fetch( peer_deps: PeerDeps, ) -> Fut { Box::pin(async move { + let fut_start = Instant::now(); let url = format!("{}/{}", registry_url, name); let resp = match client .get(&url) @@ -171,23 +213,30 @@ fn spawn_fetch( { Ok(r) if r.status().is_success() => r, _ => { + let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, transitives: Vec::new(), fetched: true, + wall_us, + net_us: wall_us, }; } }; let raw_bytes = match resp.bytes().await { Ok(b) => b, Err(_) => { + let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, transitives: Vec::new(), fetched: true, + wall_us, + net_us: wall_us, }; } }; + let net_us = fut_start.elapsed().as_micros() as u64; let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); // Stash in body_cache early so concurrent sibling specs // arriving slightly after see it on their pending pop. @@ -211,10 +260,13 @@ fn spawn_fetch( None => Vec::new(), }; + let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, transitives, fetched: true, + wall_us, + net_us, } }) } @@ -229,6 +281,7 @@ fn spawn_settle( peer_deps: PeerDeps, ) -> Fut { Box::pin(async move { + let fut_start = Instant::now(); let spec_for_parse = spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { @@ -251,10 +304,14 @@ fn spawn_settle( None => Vec::new(), }; + let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, transitives, fetched: false, + wall_us, + // Settle-only futures have no network component. 
+ net_us: 0, } }) } @@ -269,6 +326,15 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); + // Per-future wall + net sums for eff_parallel computation. + // sum_wall_us / total_wall_ms / 1000 = eff_parallel for the + // whole future-body span (network + parse + cache writes). + // sum_net_us / total_wall_ms / 1000 = network-only eff_parallel, + // directly comparable to manifest-bench's avg_conc. + let mut sum_wall_us: u64 = 0; + let mut sum_net_us: u64 = 0; + let mut fetch_count: u64 = 0; + let mut settle_count: u64 = 0; let total_start = Instant::now(); let client = match build_mb_client() { @@ -333,6 +399,14 @@ pub async fn mb_fetch( let Some(out) = futs.next().await else { break }; + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + } else { + settle_count += 1; + } + if out.transitives.is_empty() && out.fetched { // Empty result from a fetch is ambiguous (no transitives // OR a fetch/parse failure). 
Track conservatively as @@ -368,12 +442,35 @@ pub async fn mb_fetch( } } - let total_wall = total_start.elapsed().as_millis(); + let total_wall_ms = total_start.elapsed().as_millis(); + let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); + let eff_par_full = if total_wall_us > 0 { + sum_wall_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let eff_par_net = if total_wall_us > 0 { + sum_net_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let avg_wall_us = sum_wall_us + .checked_div(fetch_count + settle_count) + .unwrap_or(0); + let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={}", - total_wall, + "p1-breakdown mb_fetch wall={}ms ok={} fail={} fetch={} settle={} sum_wall={}ms sum_net={}ms avg_wall={}us avg_net={}us eff_par_full={:.1} eff_par_net={:.1}", + total_wall_ms, stats.success, stats.fail, + fetch_count, + settle_count, + sum_wall_us / 1000, + sum_net_us / 1000, + avg_wall_us, + avg_net_us, + eff_par_full, + eff_par_net, ); stats From c02bb15280e5132fc96495a3cf1d3206c423ec31 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 15:19:38 +0800 Subject: [PATCH 25/32] perf(pm): fold preload + BFS into mb_fetch_with_graph for utoo deps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 3 of staged service-layer ablation. Targets the 305 ms BFS phase observed against a fully-warm MemoryCache — 100 % attributed to process_dependency.await sum (graph mutations) per d9fb2070's new bfs instrumentation. Adds: * `process_dependency_with_resolved` in builder.rs — sync variant of process_dependency for the registry-resolved case. Skips spec-routing (only Registry handled), skips resolve_registry_dep (resolved is the parameter), skips override re-resolve. Reuses existing helpers (find_compatible_node, create_package_node, add_edges_from, mark_dependency_resolved, update_node_type_from_edge). 
* `mb_fetch_with_graph` in mb_resolve.rs — folded streaming preload + graph build. Each fetch result triggers inline process_dependency_with_resolved for every parent edge waiting on (name, spec). New nodes' edges feed back into pending / edge_targets, so the walk continues streaming-style. CPU work (graph mutations, ~305 ms total) overlaps with network IO (mb_fetch's wall ~2.4 s). Wires `service::api::build_deps` to use mb_fetch_with_graph for the lockfile-only path (skip_preload + cold cache). The follow-up build_deps_with_config still runs to handle any non-registry edges left unresolved (workspace / git / http / file); on registry-only workloads it's near no-op. Install path unchanged — pipeline_deps_options keeps preload + PackageResolved early-start signal for tgz download. Expected: utoo p1 wall drops from ~2.76 s toward mb_fetch wall + serialize ≈ 2.4-2.5 s on good network. Tracing line: p1-breakdown mb_fetch_with_graph wall=Xms ok=N fetch=N settle=N sum_wall=Xms sum_net=Xms sum_graph=Xms avg_net=Xus eff_par_full=N.N eff_par_net=N.N unresolved_targets=N Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 49 ++++ crates/ruborist/src/resolver/mb_resolve.rs | 297 +++++++++++++++++++++ crates/ruborist/src/service/api.rs | 36 +-- 3 files changed, 367 insertions(+), 15 deletions(-) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index a5d3e12a4..97db89e79 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -651,6 +651,55 @@ pub async fn process_dependency( } } +/// Sync variant of [`process_dependency`] for callers that already +/// have a resolved registry manifest in hand (the +/// `mb_fetch_with_graph` lockfile-only path populates one +/// per fetch). Skips: +/// * spec-routing (`Git` / `Http` / `Local` / `Workspace`) — only +/// the `Registry` branch is handled. 
Non-registry edges are +/// left unresolved for the caller to defer. +/// * `resolve_registry_dep` (the resolved package is the +/// parameter). +/// * Override re-resolve (uses the original resolved package even +/// if `graph.check_override` would re-route the spec). Override +/// re-resolve requires another network round-trip; the +/// lockfile-only fast path skips it intentionally — overridden +/// specs that diverge from the original resolution will need a +/// follow-up BFS sweep. +/// +/// Returns the same [`ProcessResult`] shape as `process_dependency` +/// so the caller can register newly-created nodes' edges with +/// `edge_targets` for the streaming graph build. +pub fn process_dependency_with_resolved( + graph: &mut DependencyGraph, + parent_idx: NodeIndex, + edge_info: &DependencyEdgeInfo, + resolved: &ResolvedPackage, + config: &BuildDepsConfig, +) -> ProcessResult { + match graph.find_compatible_node(parent_idx, &edge_info.name, &edge_info.spec) { + FindResult::Reuse(existing_index) => { + graph.mark_dependency_resolved(edge_info.edge_id, existing_index); + update_node_type_from_edge(graph, parent_idx, existing_index, &edge_info.edge_type); + ProcessResult::Reused(existing_index) + } + FindResult::Conflict(conflict_parent) | FindResult::New(conflict_parent) => { + let new_node = create_package_node(&edge_info.name, resolved, conflict_parent, graph); + let new_index = graph.add_node(new_node); + graph.add_physical_edge(conflict_parent, new_index); + graph.mark_dependency_resolved(edge_info.edge_id, new_index); + update_node_type_from_edge(graph, parent_idx, new_index, &edge_info.edge_type); + add_edges_from( + graph, + new_index, + &*resolved.manifest, + &EdgeContext::new(config.peer_deps, DevDeps::Exclude), + ); + ProcessResult::Created(new_index) + } + } +} + /// Build the complete dependency tree using BFS traversal. /// /// This is the main entry point for dependency resolution. 
It starts from diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index a4b2ba8c1..4e2f8cc85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -37,14 +37,20 @@ use std::time::Instant; use anyhow::{Context, Result, anyhow}; use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; +use petgraph::graph::{EdgeIndex, NodeIndex}; use serde::Deserialize; +use crate::model::graph::DependencyGraph; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; +use crate::resolver::builder::{ + BuildDepsConfig, ProcessResult, collect_unresolved_edges, process_dependency_with_resolved, +}; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; +use crate::traits::registry::ResolvedPackage; #[derive(Debug, Default)] pub struct MbFetchStats { @@ -475,3 +481,294 @@ pub async fn mb_fetch( stats } + +// ============================================================================ +// Folded streaming graph build — preload + BFS in one phase +// ============================================================================ + +/// Edges waiting on a `(name, spec)` fetch. Multiple parents can need +/// the same registry dep; we track them all and process inline as +/// soon as the manifest lands. +type EdgeTargets = HashMap<(String, String), Vec<(NodeIndex, EdgeIndex)>>; + +/// Collect the unresolved registry edges from `node_idx` into +/// pending + edge_targets, dedup by spec via `seen_specs`. +/// Non-registry edges (workspace / git / http / file) are +/// deliberately left for the follow-up BFS sweep. 
+fn enqueue_node_edges(
+    graph: &DependencyGraph,
+    node_idx: NodeIndex,
+    pending: &mut VecDeque<(String, String)>,
+    seen_specs: &mut HashSet<(String, String)>,
+    edge_targets: &mut EdgeTargets,
+) {
+    for edge in collect_unresolved_edges(graph, node_idx) {
+        if !edge.spec.is_registry_spec() {
+            continue;
+        }
+        let key = (edge.name.clone(), edge.spec.clone());
+        edge_targets
+            .entry(key.clone())
+            .or_default()
+            .push((node_idx, edge.edge_id));
+        if seen_specs.insert(key.clone()) {
+            pending.push_back(key);
+        }
+    }
+}
+
+/// Folded variant: combines `mb_fetch`'s streaming preload with the
+/// graph mutations that BFS would otherwise do in a separate phase.
+/// Each fetch result triggers inline `process_dependency_with_resolved`
+/// for every parent edge waiting on `(name, spec)`. New nodes' edges
+/// feed back into pending / edge_targets, so the walk continues
+/// streaming-style without a separate level-by-level traversal.
+///
+/// CPU work (graph mutations) overlaps with network IO (more fetches
+/// in flight via `FuturesUnordered`), so the 305 ms BFS phase
+/// observed against a fully-warm cache is collapsed into mb_fetch's
+/// wall instead of running serially after it.
+///
+/// Non-registry edges (workspace / git / http / file) and any edges
+/// added after the streaming loop converges (override re-resolves
+/// that diverge from the original spec) are left unresolved — the
+/// caller must run a follow-up BFS sweep to handle them. For
+/// `utoo deps` on registry-only workloads (the common case), the
+/// sweep is a no-op.
+pub async fn mb_fetch_with_graph( + graph: &mut DependencyGraph, + registry_url: &str, + cache: &MemoryCache, + preload_config: &PreloadConfig, + build_config: &BuildDepsConfig, +) -> Result { + let mut stats = MbFetchStats::default(); + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return Ok(stats); + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = preload_config.concurrency; + let peer_deps = preload_config.peer_deps; + + // Initial seed: walk root + workspace nodes for unresolved + // registry edges. (Workspace nodes were created during graph + // initialization in `service::api::build_deps`.) + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + let mut edge_targets: EdgeTargets = HashMap::new(); + + let root_index = graph.root_index; + enqueue_node_edges( + graph, + root_index, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + // Workspace nodes' direct edges. Workspace deps may be + // workspace: (resolved at graph init) or registry; registry + // ones land in pending. + for node_idx in graph.graph.node_indices() { + if let Some(node) = graph.get_node(node_idx) + && node.is_workspace() + { + enqueue_node_edges( + graph, + node_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + } + } + + // Sibling-fetch dedup carries over from `mb_fetch`. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + // Per-fetch-future timing accumulators (same as `mb_fetch`). 
+ let mut sum_wall_us: u64 = 0; + let mut sum_net_us: u64 = 0; + let mut fetch_count: u64 = 0; + let mut settle_count: u64 = 0; + // Sum of CPU spent in inline graph mutations across all fetched + // events. Reported alongside the fetch totals so we can attribute + // the mb_fetch wall split between IO and CPU. + let mut sum_graph_us: u64 = 0; + + loop { + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; + } + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + } else { + settle_count += 1; + } + if out.fetched { + stats.success += 1; + } + + // Drain sibling specs deferred while the fetch was in flight. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } + + // Graph mutations: process every parent edge waiting on + // `(name, spec)` for each transitive spec the fetch resolved + // (the fetch itself touched only the primary spec; sibling + // settles touch their own specs). Each settle path covers + // its own bucket via the `out.transitives` path below. + // + // The fetched/settled (name, spec) pair has already been + // written to the cache by the future. 
Look up the version + // manifest to get the ResolvedPackage handed to + // process_dependency_with_resolved. + let graph_start = Instant::now(); + let process_key_specs: Vec<(String, String)> = out + .transitives + .iter() + .map(|(n, s)| (n.clone(), s.clone())) + .collect(); + // The primary fetched/settled spec itself: resolve it now. + let primary_keys: Vec<(String, String)> = edge_targets + .keys() + .filter(|(n, _)| n == &out.name) + .cloned() + .collect(); + for (k_name, k_spec) in primary_keys { + // Pull resolved manifest from cache for this spec. + let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { + continue; + }; + let resolved = ResolvedPackage { + name: k_name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let waiting = edge_targets.remove(&(k_name.clone(), k_spec.clone())); + if let Some(targets) = waiting { + for (parent_idx, edge_id) in targets { + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id, + name: k_name.clone(), + spec: k_spec.clone(), + // edge_type carried separately on the graph; we + // re-look-up the actual edge here for + // correctness. + edge_type: graph + .graph + .edge_weight(edge_id) + .and_then(|e| match e { + crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), + _ => None, + }) + .unwrap_or(crate::model::node::EdgeType::Prod), + }; + let result = process_dependency_with_resolved( + graph, + parent_idx, + &edge_info, + &resolved, + build_config, + ); + if let ProcessResult::Created(new_idx) = result { + // The new node's transitive edges become new + // pending entries. Same dedup as the seed. 
+ enqueue_node_edges( + graph, + new_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + } + } + } + } + sum_graph_us += graph_start.elapsed().as_micros() as u64; + // Suppress an unused-vars warning when the transitive list is + // identical to the keys we just pulled from edge_targets — + // we keep collecting it for tracing parity with `mb_fetch`. + let _ = process_key_specs; + } + + let total_wall_ms = total_start.elapsed().as_millis(); + let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); + let eff_par_full = if total_wall_us > 0 { + sum_wall_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let eff_par_net = if total_wall_us > 0 { + sum_net_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); + let unresolved_remaining = edge_targets.len(); + tracing::info!( + "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={}", + total_wall_ms, + stats.success, + fetch_count, + settle_count, + sum_wall_us / 1000, + sum_net_us / 1000, + sum_graph_us / 1000, + avg_net_us, + eff_par_full, + eff_par_net, + unresolved_remaining, + ); + + Ok(stats) +} diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 06079b248..837ebfc5b 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,9 +36,8 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, - gather_preload_deps, }; -use crate::resolver::mb_resolve::mb_fetch; +use crate::resolver::mb_resolve::mb_fetch_with_graph; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; 
@@ -272,32 +271,39 @@ where ); } - // Lockfile-only callers (`utoo deps`) skip the receiver-driven - // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Route through `mb_fetch` — a - // ruborist-internal standalone preload that bypasses - // `service::http`, `service::manifest`, and `service::registry` - // to match `manifest-bench`'s loop shape directly. PM is - // unaware: this dispatch happens entirely inside ruborist when - // `skip_preload=true` and there's no warm project cache. - if skip_preload_caller && cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); + // Lockfile-only callers (`utoo deps`) route through + // `mb_fetch_with_graph` — a folded streaming preload + graph + // build. The fetch loop drives manifest IO; per-result inline + // `process_dependency_with_resolved` mutates the graph. Result: + // no separate BFS phase. The follow-up + // `build_deps_with_config` call still runs to handle any + // non-registry edges (workspace / git / http / file) the fold + // path skipped, but on registry-only workloads it's near no-op. + let folded = skip_preload_caller && cache_count == 0; + if folded { let preload_config = PreloadConfig { peer_deps, concurrency, }; - mb_fetch( - initial_deps, + mb_fetch_with_graph( + &mut graph, registry.registry_url(), registry.cache(), &preload_config, + &config, ) - .await; + .await + .map_err(|e| e.context("mb_fetch_with_graph failed"))?; } // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. + // + // For the folded path this BFS sweeps remaining unresolved edges + // (non-registry: workspace / git / http / file). On + // registry-only workloads (the common case) the graph is fully + // built already, BFS walks nothing. 
build_deps_with_config(&mut graph, ®istry, config, &receiver) .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; From 63928a73b09d42953a8a41eba9d95deb0c4a47ad Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 15:44:43 +0800 Subject: [PATCH 26/32] =?UTF-8?q?fix(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20drain=20edge=5Ftargets=20via=20inline=20cache=20hit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit c02bb152 had unresolved_targets=583 in trace — `enqueue_node_edges` was unconditionally pushing (parent, edge_id) into edge_targets without checking if the (name, spec) was already cached. When a later transitive's edge referenced an already-fetched (name, spec), no fetch result would land to drain that bucket — the parent edges sat unresolved, potentially missing packages from the lockfile. Fix: enqueue_node_edges now checks cache.get_version_manifest first. Cache hit → process_dependency_with_resolved inline (with a work_stack to recurse into newly-Created nodes' edges). Cache miss → original behavior (stash in edge_targets, push to pending). Side effect: more inline graph mutation work in the seed phase (workspace + root edges that hit warm cache from previous specs in the same root). Should reduce the number of fetch-result events that need to do graph mutations downstream, since orphan edges no longer accumulate. Targets the correctness bug from c02bb152 trace; perf impact TBD. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 109 ++++++++++++++++----- 1 file changed, 84 insertions(+), 25 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 4e2f8cc85..4252e7efd 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -495,24 +495,73 @@ type EdgeTargets = HashMap<(String, String), Vec<(NodeIndex, EdgeIndex)>>; /// pending + edge_targets, dedup by spec via `seen_specs`. /// Non-registry edges (workspace / git / http / file) are /// deliberately left for the follow-up BFS sweep. +/// Process this node's unresolved registry edges: +/// * If the (name, spec) is already cached (a sibling subtree +/// resolved it earlier), call `process_dependency_with_resolved` +/// inline now. Newly-created child nodes recurse via this same +/// function so their edges are also enqueued/processed. +/// * Otherwise, register the (parent, edge_id) under `edge_targets` +/// so the eventual fetch result drains it; push to `pending` if +/// this `(name, spec)` hasn't been seen. +/// +/// Without the inline-process path, `(name, spec)` keys added +/// AFTER their fetch already landed would never be drained — they'd +/// sit in `edge_targets` and the corresponding parent edges would +/// stay unresolved. CI run c02bb152 showed ~580 such orphans. 
fn enqueue_node_edges( - graph: &DependencyGraph, + graph: &mut DependencyGraph, node_idx: NodeIndex, pending: &mut VecDeque, seen_specs: &mut HashSet<(String, String)>, edge_targets: &mut EdgeTargets, + cache: &MemoryCache, + build_config: &BuildDepsConfig, ) { - for edge in collect_unresolved_edges(graph, node_idx) { - if !edge.spec.is_registry_spec() { - continue; - } - let key = (edge.name.clone(), edge.spec.clone()); - edge_targets - .entry(key.clone()) - .or_default() - .push((node_idx, edge.edge_id)); - if seen_specs.insert(key.clone()) { - pending.push_back(key); + let mut work_stack: Vec = vec![node_idx]; + while let Some(idx) = work_stack.pop() { + let edges = collect_unresolved_edges(graph, idx); + for edge in edges { + if !edge.spec.is_registry_spec() { + continue; + } + let key = (edge.name.clone(), edge.spec.clone()); + + // Cache-hit fast path: process immediately, no + // edge_targets stash. Reuses the same process logic the + // main loop uses on fetch result. + if let Some(core_arc) = cache.get_version_manifest(&edge.name, &edge.spec) { + let resolved = ResolvedPackage { + name: edge.name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id: edge.edge_id, + name: edge.name.clone(), + spec: edge.spec.clone(), + edge_type: edge.edge_type, + }; + if let ProcessResult::Created(new_idx) = process_dependency_with_resolved( + graph, + idx, + &edge_info, + &resolved, + build_config, + ) { + work_stack.push(new_idx); + } + // Whether Created or Reused, this edge is now + // resolved — don't queue. + continue; + } + + edge_targets + .entry(key.clone()) + .or_default() + .push((idx, edge.edge_id)); + if seen_specs.insert(key.clone()) { + pending.push_back(key); + } } } } @@ -570,22 +619,27 @@ pub async fn mb_fetch_with_graph( &mut pending, &mut seen_specs, &mut edge_targets, + cache, + build_config, ); // Workspace nodes' direct edges. 
Workspace deps may be // workspace: (resolved at graph init) or registry; registry // ones land in pending. - for node_idx in graph.graph.node_indices() { - if let Some(node) = graph.get_node(node_idx) - && node.is_workspace() - { - enqueue_node_edges( - graph, - node_idx, - &mut pending, - &mut seen_specs, - &mut edge_targets, - ); - } + let workspace_indices: Vec = graph + .graph + .node_indices() + .filter(|&i| graph.get_node(i).is_some_and(|n| n.is_workspace())) + .collect(); + for node_idx in workspace_indices { + enqueue_node_edges( + graph, + node_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + cache, + build_config, + ); } // Sibling-fetch dedup carries over from `mb_fetch`. @@ -722,13 +776,18 @@ pub async fn mb_fetch_with_graph( ); if let ProcessResult::Created(new_idx) = result { // The new node's transitive edges become new - // pending entries. Same dedup as the seed. + // pending entries. enqueue handles cache-hit + // inline-process so we don't orphan + // edge_targets entries after their fetch + // already landed. enqueue_node_edges( graph, new_idx, &mut pending, &mut seen_specs, &mut edge_targets, + cache, + build_config, ); } } From 2527137b4bbb7a7e2622abeefedf3d03e1070c2a Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:09:41 +0800 Subject: [PATCH 27/32] ci(pcap): add manifest-bench + preload-bench captures for TCP-level diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 700ms gap between utoo p1 (folded mb_fetch_with_graph) and manifest-bench standalone needs network-layer evidence. Same workload, same conc, same network → why does utoo wall trail by 700ms when per-fetch latency is matched (avg_net=53us = mb p50=40us ish)? 
Hypotheses to test via pcap diff: * Fewer concurrent TCP streams in flight at any moment (utoo's main loop CPU steals tokio dispatch capacity → in-flight count drops below conc cap) * More TLS handshakes (utoo's connection pool isn't reusing as effectively as mb's per-rep fresh client) * Larger inter-packet gaps per stream (utoo's runtime pauses mid download) * Different concurrent-stream-time profile (wave shape) Adds two captures at end of pm-bench-pcap.sh: manifest-bench-c96 — flat lockfile-derived names @ conc=96 preload-bench-c96 — transitive walk @ conc=96 (matches utoo's walk shape, but no graph build) Each captured with the same tcpdump + iostat as the existing utoo / utoo-next / bun captures. analyze_pcap globs *.pcap so the new files get the same TCP signal extraction (zwin / retx / dup_ack / per-stream gap p50/p99/max / distinct streams). Workflow: downloads manifest-bench-linux-x64 + preload-bench-linux-x64 artifacts (built by build-linux's benchmark-label conditional steps) into the pm-bench-pcap-linux job env so pm-bench-pcap.sh can find them. Trigger: workflow_dispatch with target=pm-bench-pcap. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 23 ++++++++++++++++++++ bench/pm-bench-pcap.sh | 34 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index eb560969b..13b463a22 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1000,6 +1000,29 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # manifest-bench + preload-bench binaries for pcap-comparing + # utoo's TCP-level behaviour against pure-HTTP and + # transitive-walk baselines. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Capture pcap env: PROJECT: ${{ github.event.inputs.project || 'ant-design' }} diff --git a/bench/pm-bench-pcap.sh b/bench/pm-bench-pcap.sh index 7a0f7c819..d7f71e106 100755 --- a/bench/pm-bench-pcap.sh +++ b/bench/pm-bench-pcap.sh @@ -139,6 +139,40 @@ fi run_pm_phases bun "$(command -v bun)" "$BUN_CACHE" +# --- standalone bench captures (resolve-only baselines) ---------------- +# After all PM captures, regenerate a fresh package-lock.json via utoo +# deps (untimed) so manifest-bench has a stable name list to consume. +# Then pcap-capture each standalone bench at conc=96 — the same conc +# utoo's mb_fetch_with_graph ran with — so the TCP signals are +# directly comparable between the integrated path and the pure-HTTP +# / pure-streaming-walk ceilings. 
+cd "$PROJECT_DIR" +rm -f package-lock.json bun.lock +rm -rf "$UTOO_CACHE" node_modules +echo "=== regenerating package-lock.json for standalone benches ===" +utoo deps --registry="$REGISTRY" --cache-dir="$UTOO_CACHE" \ + >/dev/null 2>&1 || echo "lock regen failed" + +if [ -f package-lock.json ] && [ -n "${MANIFEST_BENCH_BIN:-}" ] && [ -x "$MANIFEST_BENCH_BIN" ]; then + capture_one "manifest-bench-c96" \ + "$MANIFEST_BENCH_BIN" \ + --lockfile package-lock.json \ + --registry "$REGISTRY" \ + --concurrency 96 --reps 1 --http1-only +else + echo "skip manifest-bench: bin missing or no lockfile" +fi + +if [ -n "${PRELOAD_BENCH_BIN:-}" ] && [ -x "$PRELOAD_BENCH_BIN" ]; then + capture_one "preload-bench-c96" \ + "$PRELOAD_BENCH_BIN" \ + --package-json package.json \ + --registry "$REGISTRY" \ + --concurrency 96 --reps 1 +else + echo "skip preload-bench: bin missing" +fi + # --- post-capture analysis: tshark metrics per pcap --------------------- # Extract TCP-level stress signals to validate the "install greediness # starves download" hypothesis. All of these are pre-TLS so we don't need From fe26709ebf7df12a70c65b06c86f0d20266cdf69 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:45:05 +0800 Subject: [PATCH 28/32] ci(pcap): upload small summaries artifact alongside the 2GB pcap dump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous pm-bench-pcap artifact was 2GB (raw .pcap files for every PM × phase × bench), making the round-trip download impractical just to read JSON metrics. Adds a separate `pm-bench-pcap-summaries` artifact containing only the *.json / *.log / *.iostat.txt / dns.txt files — KB scale, downloads in seconds. Raw pcap artifact is preserved for cases where we want to re-run tshark with different filters. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 13b463a22..1970a2cd5 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1030,6 +1030,20 @@ jobs: run: | chmod +x bench/pm-bench-pcap.sh bash bench/pm-bench-pcap.sh + # Small artifact (KB scale) with just the per-capture + + # aggregated metrics — fast to download for diff analysis, + # avoids the 2GB pcap-corpus pull when we only need numbers. + - name: Upload pcap summaries + if: always() + uses: actions/upload-artifact@v4 + with: + name: pm-bench-pcap-summaries + path: | + /tmp/pm-bench-pcap/*.json + /tmp/pm-bench-pcap/*.log + /tmp/pm-bench-pcap/*.iostat.txt + /tmp/pm-bench-pcap/dns.txt + retention-days: 7 - name: Upload pcap artifact if: always() uses: actions/upload-artifact@v4 From 1ff58aec7f33282103361249d9f101ec1b394b8e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:52:02 +0800 Subject: [PATCH 29/32] ci(pcap): upload summary-only artifact + print table to CI logs The pm-bench-pcap artifact is ~2 GB (pcap binaries dominate). gh run download keeps timing out before completion. Two fixes: 1. New `pm-bench-pcap-summaries` artifact uploads only the JSON summaries + .log + iostat.txt + dns.txt (small, fast download). The full pcap artifact stays for deep inspection when needed. 2. End of pm-bench-pcap.sh prints a tab-separated comparison table (name, wall_s, packets, streams, zwin, retx, dup_ack, gap_p99_us, gap_max_us) to stdout, so the data is visible in the CI run log without downloading anything. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 15 +++++++++++++++ bench/pm-bench-pcap.sh | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 1970a2cd5..5b219199a 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1051,3 +1051,18 @@ jobs: name: pm-bench-pcap path: /tmp/pm-bench-pcap retention-days: 7 + # Tiny summary-only artifact for quick comparison without + # re-downloading the multi-GB pcap blob. Includes the + # tshark-extracted JSON metrics + the pcap.log files (text, + # tiny) but no .pcap binaries. + - name: Upload pcap summaries (small) + if: always() + uses: actions/upload-artifact@v4 + with: + name: pm-bench-pcap-summaries + path: | + /tmp/pm-bench-pcap/*.json + /tmp/pm-bench-pcap/*.log + /tmp/pm-bench-pcap/*.iostat.txt + /tmp/pm-bench-pcap/dns.txt + retention-days: 7 diff --git a/bench/pm-bench-pcap.sh b/bench/pm-bench-pcap.sh index d7f71e106..7f37fc5db 100755 --- a/bench/pm-bench-pcap.sh +++ b/bench/pm-bench-pcap.sh @@ -367,3 +367,25 @@ fi echo "done. files:" ls -lh "$PCAP_DIR" + +# Print summary table to CI logs so we don't need to download the +# 2 GB pcap artifact just to read the comparison numbers. 
+echo +echo "=== summary table ===" +if command -v jq >/dev/null && [ -f "$PCAP_DIR/summary.json" ]; then + jq -r ' + .captures + | (["name", "wall_s", "packets", "streams", "zwin", "retx", "dup_ack", "gap_p99_us", "gap_max_us"] | @tsv), + (.[] | [ + .name, + (.wall_seconds | tostring), + (.packet_count | tostring), + (.distinct_streams | tostring), + (.zero_windows | tostring), + (.retransmits | tostring), + (.duplicate_acks | tostring), + (.gap_p99_us | tostring), + (.gap_max_us | tostring) + ] | @tsv) + ' "$PCAP_DIR/summary.json" | column -t +fi From a21f24bca1f6d0172b36a1b386ac3aed2ef61165 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 23:14:46 +0800 Subject: [PATCH 30/32] =?UTF-8?q?perf(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20channel-based=20separation=20of=20fetch=20+=20graph?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pcap evidence (utoo-resolve zwin=71 vs mb-c96 zwin=49) confirmed main loop CPU was starving tokio runtime workers from polling sockets. Inline graph mutations (sum_graph=450ms across the fetch loop) blocked the worker between awaits, so TCP receive buffers filled and the server paused sending — directly extending wall. This refactor: * Spawns `graph_worker` as a separate tokio task (gets its own runtime worker thread on multi-thread runtime). Owns the DependencyGraph + edge_targets + seen_specs. * Main loop owns FuturesUnordered + body_cache + dispatch state. No graph mutations on this path. * mpsc channels: main → graph (FetchEventMsg, just the name — cache writes already in the future), graph → main (Vec new pending specs to extend the fetch queue). * `tokio::select!` with `biased` drains specs first to unblock fetch dispatch. * `in_flight_graph` counter tracks outstanding messages to graph worker — termination = futs empty + in_flight_graph == 0. 
Function signature changed: takes `mut graph: DependencyGraph` by value, returns `(DependencyGraph, MbFetchStats)` since the worker task needs ownership of the graph (can't borrow across spawn). api.rs caller threads the graph through. Expected: zwin drops back toward mb's ~49 (no more main loop starvation), eff_par_net climbs from 56 toward mb's 72, wall saves ~200ms. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 402 ++++++++++++++------- crates/ruborist/src/service/api.rs | 5 +- 2 files changed, 277 insertions(+), 130 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 4252e7efd..197fcbc26 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -39,6 +39,7 @@ use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; use petgraph::graph::{EdgeIndex, NodeIndex}; use serde::Deserialize; +use tokio::sync::mpsc; use crate::model::graph::DependencyGraph; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -584,13 +585,21 @@ fn enqueue_node_edges( /// caller must run a follow-up BFS sweep to handle them. For /// `utoo deps` on registry-only workloads (the common case), the /// sweep is a no-op. +/// One fetched/settled event, sent from main loop to graph worker. +/// The future already performed cache writes inline (cheap DashMap +/// inserts). Graph worker uses `cache.get_version_manifest` to +/// retrieve the manifest for `process_dependency_with_resolved`. 
+struct FetchEventMsg { + name: String, +} + pub async fn mb_fetch_with_graph( - graph: &mut DependencyGraph, + mut graph: DependencyGraph, registry_url: &str, cache: &MemoryCache, preload_config: &PreloadConfig, build_config: &BuildDepsConfig, -) -> Result { +) -> Result<(DependencyGraph, MbFetchStats)> { let mut stats = MbFetchStats::default(); let total_start = Instant::now(); @@ -598,7 +607,7 @@ pub async fn mb_fetch_with_graph( Ok(c) => c, Err(e) => { tracing::warn!("mb_resolve client build failed: {e}"); - return Ok(stats); + return Ok((graph, stats)); } }; let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); @@ -606,15 +615,15 @@ pub async fn mb_fetch_with_graph( let peer_deps = preload_config.peer_deps; // Initial seed: walk root + workspace nodes for unresolved - // registry edges. (Workspace nodes were created during graph - // initialization in `service::api::build_deps`.) + // registry edges. Done inline before spawning workers (one-time + // cost, not on the hot path). let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut pending: VecDeque = VecDeque::new(); let mut edge_targets: EdgeTargets = HashMap::new(); let root_index = graph.root_index; enqueue_node_edges( - graph, + &mut graph, root_index, &mut pending, &mut seen_specs, @@ -622,9 +631,6 @@ pub async fn mb_fetch_with_graph( cache, build_config, ); - // Workspace nodes' direct edges. Workspace deps may be - // workspace: (resolved at graph init) or registry; registry - // ones land in pending. let workspace_indices: Vec = graph .graph .node_indices() @@ -632,7 +638,7 @@ pub async fn mb_fetch_with_graph( .collect(); for node_idx in workspace_indices { enqueue_node_edges( - graph, + &mut graph, node_idx, &mut pending, &mut seen_specs, @@ -642,24 +648,48 @@ pub async fn mb_fetch_with_graph( ); } - // Sibling-fetch dedup carries over from `mb_fetch`. + // Channels: main → graph (fetched events) + graph → main (new + // pending specs). 
Bounded at 2 * cap so neither side stalls + // waiting for the other under bursty wave behavior. + let (fetch_tx, fetch_rx) = mpsc::channel::(cap * 2 + 16); + let (specs_tx, mut specs_rx) = mpsc::channel::>(cap * 2 + 16); + + // Spawn graph worker: owns the graph + edge_targets + seen_specs. + // This task is CPU-only (no awaits except channel IO), so on a + // multi-thread tokio runtime it gets its own worker thread, + // freeing the main task's worker to drive socket polling. That + // separation is the whole point of this rewrite — the inline + // version observed zwin events 71 vs mb's 49, evidence of main + // loop CPU starving the runtime's IO polling. + let cache_clone = cache.clone(); + let build_config_owned = build_config.clone(); + let graph_handle = tokio::spawn(graph_worker( + graph, + edge_targets, + seen_specs, + cache_clone, + build_config_owned, + fetch_rx, + specs_tx, + )); + + // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); let mut in_flight_names: HashSet = HashSet::new(); let mut deferred_by_name: HashMap> = HashMap::new(); - let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Per-fetch-future timing accumulators (same as `mb_fetch`). let mut sum_wall_us: u64 = 0; let mut sum_net_us: u64 = 0; let mut fetch_count: u64 = 0; let mut settle_count: u64 = 0; - // Sum of CPU spent in inline graph mutations across all fetched - // events. Reported alongside the fetch totals so we can attribute - // the mb_fetch wall split between IO and CPU. - let mut sum_graph_us: u64 = 0; + // Number of FetchEventMsg sent to graph worker that haven't yet + // had a corresponding Vec response. Drives termination: + // when futs empty + in_flight == 0, no more work pipelined. + let mut in_flight_graph: usize = 0; loop { + // Refill futs from pending up to cap. 
while futs.len() < cap { let Some((name, spec)) = pending.pop_front() else { break; @@ -683,123 +713,74 @@ pub async fn mb_fetch_with_graph( )); } - if futs.is_empty() { + // Termination: nothing in flight at fetch level AND graph + // worker has nothing pending. + if futs.is_empty() && in_flight_graph == 0 { break; } - let Some(out) = futs.next().await else { break }; - - sum_wall_us += out.wall_us; - sum_net_us += out.net_us; - if out.fetched { - fetch_count += 1; - } else { - settle_count += 1; - } - if out.fetched { - stats.success += 1; - } - - // Drain sibling specs deferred while the fetch was in flight. - if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() - { - for sibling_spec in siblings { - futs.push(spawn_settle( - out.name.clone(), - sibling_spec, - Arc::clone(&raw), - cache.clone(), - peer_deps, - )); + // Drive both halves: prefer draining specs back from graph + // worker (unblocks new fetch dispatch) over starting another + // fetch landing. + tokio::select! { + biased; + maybe_specs = specs_rx.recv() => { + match maybe_specs { + Some(specs) => { + pending.extend(specs); + in_flight_graph -= 1; + } + None => { + // Graph worker exited unexpectedly. Bail. + break; + } + } } - } + maybe_result = futs.next(), if !futs.is_empty() => { + if let Some(out) = maybe_result { + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + stats.success += 1; + } else { + settle_count += 1; + } - // Graph mutations: process every parent edge waiting on - // `(name, spec)` for each transitive spec the fetch resolved - // (the fetch itself touched only the primary spec; sibling - // settles touch their own specs). Each settle path covers - // its own bucket via the `out.transitives` path below. - // - // The fetched/settled (name, spec) pair has already been - // written to the cache by the future. 
Look up the version - // manifest to get the ResolvedPackage handed to - // process_dependency_with_resolved. - let graph_start = Instant::now(); - let process_key_specs: Vec<(String, String)> = out - .transitives - .iter() - .map(|(n, s)| (n.clone(), s.clone())) - .collect(); - // The primary fetched/settled spec itself: resolve it now. - let primary_keys: Vec<(String, String)> = edge_targets - .keys() - .filter(|(n, _)| n == &out.name) - .cloned() - .collect(); - for (k_name, k_spec) in primary_keys { - // Pull resolved manifest from cache for this spec. - let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { - continue; - }; - let resolved = ResolvedPackage { - name: k_name.clone(), - version: core_arc.version.clone(), - manifest: core_arc, - }; - let waiting = edge_targets.remove(&(k_name.clone(), k_spec.clone())); - if let Some(targets) = waiting { - for (parent_idx, edge_id) in targets { - let edge_info = crate::resolver::edges::DependencyEdgeInfo { - edge_id, - name: k_name.clone(), - spec: k_spec.clone(), - // edge_type carried separately on the graph; we - // re-look-up the actual edge here for - // correctness. - edge_type: graph - .graph - .edge_weight(edge_id) - .and_then(|e| match e { - crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), - _ => None, - }) - .unwrap_or(crate::model::node::EdgeType::Prod), - }; - let result = process_dependency_with_resolved( - graph, - parent_idx, - &edge_info, - &resolved, - build_config, - ); - if let ProcessResult::Created(new_idx) = result { - // The new node's transitive edges become new - // pending entries. enqueue handles cache-hit - // inline-process so we don't orphan - // edge_targets entries after their fetch - // already landed. - enqueue_node_edges( - graph, - new_idx, - &mut pending, - &mut seen_specs, - &mut edge_targets, - cache, - build_config, - ); + // Drain sibling specs deferred while the fetch + // was in flight. 
Sibling settles also produce a + // FetchEventMsg downstream. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } + + // Send to graph worker. `send().await` only + // blocks if channel is full (cap * 2 buffer); + // under steady state shouldn't happen. + if fetch_tx.send(FetchEventMsg { name: out.name }).await.is_ok() { + in_flight_graph += 1; } } } } - sum_graph_us += graph_start.elapsed().as_micros() as u64; - // Suppress an unused-vars warning when the transitive list is - // identical to the keys we just pulled from edge_targets — - // we keep collecting it for tracing parity with `mb_fetch`. - let _ = process_key_specs; } + // Signal graph worker to exit, then await its finalization to + // recover the graph + stats. + drop(fetch_tx); + let (graph, graph_stats) = graph_handle.await.context("graph worker join")??; + let total_wall_ms = total_start.elapsed().as_millis(); let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); let eff_par_full = if total_wall_us > 0 { @@ -813,21 +794,186 @@ pub async fn mb_fetch_with_graph( 0.0 }; let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); - let unresolved_remaining = edge_targets.len(); tracing::info!( - "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={}", + "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={} graph_processed={} graph_new_specs={}", total_wall_ms, stats.success, fetch_count, settle_count, sum_wall_us / 1000, sum_net_us / 1000, - sum_graph_us / 1000, + 
graph_stats.sum_graph_us / 1000, avg_net_us, eff_par_full, eff_par_net, - unresolved_remaining, + graph_stats.unresolved_remaining, + graph_stats.processed, + graph_stats.new_specs_emitted, ); - Ok(stats) + Ok((graph, stats)) +} + +#[derive(Debug, Default)] +struct GraphWorkerStats { + sum_graph_us: u64, + processed: usize, + new_specs_emitted: usize, + unresolved_remaining: usize, +} + +/// CPU-only worker task that owns the graph + edge_targets + +/// seen_specs. Receives fetch events from main loop, mutates graph +/// via `process_dependency_with_resolved`, sends new pending specs +/// back. Designed to monopolize a tokio runtime worker thread so +/// the main loop's worker can drive socket polling without +/// competing for CPU. +async fn graph_worker( + mut graph: DependencyGraph, + mut edge_targets: EdgeTargets, + mut seen_specs: HashSet<(String, String)>, + cache: MemoryCache, + build_config: BuildDepsConfig, + mut fetch_rx: mpsc::Receiver, + specs_tx: mpsc::Sender>, +) -> Result<(DependencyGraph, GraphWorkerStats)> { + let mut stats = GraphWorkerStats::default(); + + while let Some(msg) = fetch_rx.recv().await { + let graph_start = Instant::now(); + stats.processed += 1; + + // Drain edge_targets for every spec keyed under this name. + // The fetch future already wrote both `(name, primary_spec)` + // and `(name, resolved_version)` cache slots, so any + // edge_targets entry for this name should hit cache. 
+ let primary_keys: Vec<(String, String)> = edge_targets + .keys() + .filter(|(n, _)| n == &msg.name) + .cloned() + .collect(); + + let mut new_specs: Vec = Vec::new(); + for (k_name, k_spec) in primary_keys { + let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { + continue; + }; + let resolved = ResolvedPackage { + name: k_name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let Some(targets) = edge_targets.remove(&(k_name.clone(), k_spec.clone())) else { + continue; + }; + for (parent_idx, edge_id) in targets { + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id, + name: k_name.clone(), + spec: k_spec.clone(), + edge_type: graph + .graph + .edge_weight(edge_id) + .and_then(|e| match e { + crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), + _ => None, + }) + .unwrap_or(crate::model::node::EdgeType::Prod), + }; + let result = process_dependency_with_resolved( + &mut graph, + parent_idx, + &edge_info, + &resolved, + &build_config, + ); + if let ProcessResult::Created(new_idx) = result { + // Walk the new node's edges. enqueue handles + // recursive cache-hit drain so already-cached + // specs get processed inline (still on this + // worker thread — graph mutations can't run on + // multiple threads with `&mut graph`). + enqueue_node_edges_into( + &mut graph, + new_idx, + &mut new_specs, + &mut seen_specs, + &mut edge_targets, + &cache, + &build_config, + ); + } + } + } + + stats.sum_graph_us += graph_start.elapsed().as_micros() as u64; + stats.new_specs_emitted += new_specs.len(); + + // Always reply (even if empty) so main loop's `in_flight` + // counter decrements for each FetchEventMsg sent. + if specs_tx.send(new_specs).await.is_err() { + // Main loop dropped the receiver — bail. 
+ break; + } + } + + stats.unresolved_remaining = edge_targets.len(); + Ok((graph, stats)) +} + +/// Same as `enqueue_node_edges` but pushes new specs into the +/// caller-provided `out` Vec instead of a VecDeque. Used by the +/// graph worker to batch "new specs from this fetch" before sending +/// them back to the main loop in one channel message. +fn enqueue_node_edges_into( + graph: &mut DependencyGraph, + node_idx: NodeIndex, + out: &mut Vec, + seen_specs: &mut HashSet<(String, String)>, + edge_targets: &mut EdgeTargets, + cache: &MemoryCache, + build_config: &BuildDepsConfig, +) { + let mut work_stack: Vec = vec![node_idx]; + while let Some(idx) = work_stack.pop() { + let edges = collect_unresolved_edges(graph, idx); + for edge in edges { + if !edge.spec.is_registry_spec() { + continue; + } + let key = (edge.name.clone(), edge.spec.clone()); + + if let Some(core_arc) = cache.get_version_manifest(&edge.name, &edge.spec) { + let resolved = ResolvedPackage { + name: edge.name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id: edge.edge_id, + name: edge.name.clone(), + spec: edge.spec.clone(), + edge_type: edge.edge_type, + }; + if let ProcessResult::Created(new_idx) = process_dependency_with_resolved( + graph, + idx, + &edge_info, + &resolved, + build_config, + ) { + work_stack.push(new_idx); + } + continue; + } + + edge_targets + .entry(key.clone()) + .or_default() + .push((idx, edge.edge_id)); + if seen_specs.insert(key.clone()) { + out.push(key); + } + } + } } diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 837ebfc5b..6c9ada9a8 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -285,8 +285,8 @@ where peer_deps, concurrency, }; - mb_fetch_with_graph( - &mut graph, + let (returned_graph, _stats) = mb_fetch_with_graph( + graph, registry.registry_url(), registry.cache(), 
&preload_config, @@ -294,6 +294,7 @@ where ) .await .map_err(|e| e.context("mb_fetch_with_graph failed"))?; + graph = returned_graph; } // Preserve the typed error via `Error::new` + `.context(...)` so CLI From d1cf53e2f14acb5110048a33031401a38aca839a Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 23:54:41 +0800 Subject: [PATCH 31/32] perf(pm): integrate channel-based mb_fetch into install pipeline Plumb the PipelineReceiver through the folded mb_fetch_with_graph path so install (`utoo install`) gets the same channel-separated fetch + graph architecture as `utoo deps`, with download/clone pipelines starting as early as the legacy preload+BFS path: - mb_fetch_with_graph now takes Arc; main loop emits PackageResolved on each fetch land (looked up via cache with the new FetchOutcome::primary_spec), graph_worker emits PackagePlaced on ProcessResult::Created. - service::api::build_deps wraps the caller-supplied receiver in Arc once and shares it between mb_fetch_with_graph and build_deps_with_config; adds + 'static bound on R. - pipeline_deps_options sets skip_preload=true so install routes through the same folded path as the lockfile-only command. CI will validate that p1 resolve continues at/below 2.5s while p0_full_cold and p3_cold_install do not regress (download + clone pipelines remain saturated via emitted events). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 842 +++++++++++++++++---- crates/pm/src/helper/ruborist_context.rs | 11 +- crates/ruborist/src/resolver/mb_resolve.rs | 63 +- crates/ruborist/src/service/api.rs | 10 +- 4 files changed, 781 insertions(+), 145 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a136807b..c4b103915 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -573,6 +588,27 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "browserslist-data" version = "0.1.4" @@ -1110,6 +1146,7 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8a506ec4b81c460798f572caead636d57d3d7e940f998160f52bd254bf2d23" dependencies = [ + "brotli", "compression-core", "flate2", "memchr", @@ -1690,23 +1727,6 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "ctor" -version = "0.10.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "83cf0d42651b16c6dfe68685716d18480d18a9c39c62d76e8cf3eb6ed5d8bcbf" -dependencies = [ - "ctor-proc-macro", - "dtor", - "link-section", -] - -[[package]] -name = "ctor-proc-macro" -version = "0.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a949c44fcacbbbb7ada007dc7acb34603dd97cd47de5d054f2b6493ecebb483" - [[package]] name = "cty" version = "0.2.2" @@ -2246,21 +2266,6 @@ dependencies = [ "dtoa", ] -[[package]] -name = "dtor" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edf234dd1594d6dd434a8fb8cada51ddbbc593e40e4a01556a0b31c62da2775b" -dependencies = [ - "dtor-proc-macro", -] - -[[package]] -name = "dtor-proc-macro" -version = "0.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2647271c92754afcb174e758003cfd1cbf1e43e5a7853d7b1813e63e19e39a73" - [[package]] name = "dunce" version = "1.0.5" @@ -4824,12 +4829,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "link-section" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b685d66585d646efe09fec763d796c291049c8b6bf84e04954bffc8748341f0d" - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4944,6 +4943,21 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ca88d725a0a943b096803bd34e73a4437208b6077654cc4ecb2947a5f91618d" +[[package]] +name = "manifest-bench" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "reqwest 0.12.24", + "rustls", + "rustls-native-certs", + "serde", + "serde_json", + "tokio", +] + [[package]] name = "markdown" version = "1.0.0" @@ -5380,7 +5394,7 @@ checksum = "55740c4ae1d8696773c78fdafd5d0e5fe9bc9f1b071c7ba493ba5c413a9184f3" dependencies = [ "anyhow", "bitflags 2.9.4", - "ctor 0.2.9", + "ctor", "napi-derive", "napi-sys", "once_cell", @@ -6358,6 +6372,22 @@ version = 
"0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84350ffee5cedfabf9bee3e8825721f651da8ff79d50fe7a37cf0ca015c428ee" +[[package]] +name = "preload-bench" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "reqwest 0.12.24", + "rustls", + "rustls-native-certs", + "serde", + "serde_json", + "simd-json", + "tokio", +] + [[package]] name = "preset_env_base" version = "7.0.0" @@ -7208,9 +7238,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.11.4" +version = "0.10.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dedc5658c6ecb3bdb5ef5f3295bb9253f42dcf3fd1402c03f6b1f7659c3c4a9" +checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" dependencies = [ "bytemuck", "byteorder", @@ -8115,9 +8145,9 @@ dependencies = [ [[package]] name = "styled_components" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72418ea605a423c70ffa8590196c83b04b04636fd25aaceabe0fa7f1e15f66f0" +checksum = "99aeadac58111060ad883c7e7a01917bcecc6572243c06d41315f200cbaa9240" dependencies = [ "Inflector", "once_cell", @@ -8134,9 +8164,9 @@ dependencies = [ [[package]] name = "styled_jsx" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc5352e19f02be3ba10fb9ecbcd0d72e9b2d9762965712f1cbe737d1f428ec" +checksum = "c3917b257122e7cf3f46f95557af3178edaa9a3fd89fc1469768e05f01901e98" dependencies = [ "anyhow", "lightningcss", @@ -8155,9 +8185,9 @@ dependencies = [ "swc_css_prefixer", "swc_css_visit", "swc_ecma_ast 23.0.0", - "swc_ecma_minifier", - "swc_ecma_parser 39.0.2", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_plugin_macro", @@ -8170,6 +8200,57 @@ version = "2.6.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "swc" +version = "61.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb7d502b72d0b5e059cefe3a55825c43587a2e3c81025862694e52deecddc3de" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bytes-str", + "dashmap 5.5.3", + "either", + "indexmap 2.13.0", + "jsonc-parser", + "once_cell", + "par-core", + "par-iter", + "parking_lot", + "regex", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_compiler_base 54.0.0", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_ext_transforms", + "swc_ecma_loader", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_preset_env 52.0.0", + "swc_ecma_transforms 51.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_compat 47.0.0", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_error_reporters", + "swc_node_comments", + "swc_plugin_proxy", + "swc_plugin_runner", + "swc_sourcemap", + "swc_timer", + "swc_transform_common", + "swc_visit", + "tokio", + "tracing", + "url", +] + [[package]] name = "swc" version = "63.0.0" @@ -8193,19 +8274,19 @@ dependencies = [ "serde_json", "swc_atoms", "swc_common 21.0.1", - "swc_compiler_base", + "swc_compiler_base 55.0.0", "swc_config", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", "swc_ecma_ext_transforms", "swc_ecma_loader", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", - "swc_ecma_preset_env", - "swc_ecma_transforms", + "swc_ecma_preset_env 53.0.0", + "swc_ecma_transforms 52.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_compat", - "swc_ecma_transforms_optimization", + "swc_ecma_transforms_compat 48.0.0", + "swc_ecma_transforms_optimization 44.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", 
"swc_error_reporters", @@ -8322,6 +8403,32 @@ dependencies = [ "url", ] +[[package]] +name = "swc_compiler_base" +version = "54.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "386c6121a98d7630ef5a07b79acee964c778568d61d3b76a188be17f19418a9c" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bytes-str", + "once_cell", + "pathdiff", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_visit 23.0.0", + "swc_sourcemap", + "swc_timer", +] + [[package]] name = "swc_compiler_base" version = "55.0.0" @@ -8341,7 +8448,7 @@ dependencies = [ "swc_config", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", "swc_ecma_visit 23.0.0", "swc_sourcemap", @@ -8398,6 +8505,38 @@ dependencies = [ "vergen", ] +[[package]] +name = "swc_core" +version = "63.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb9470306b0d532da617be037de878f64ec0f04cb364d920e8cee05d658d66de" +dependencies = [ + "par-core", + "swc 61.0.0", + "swc_allocator", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_lints", + "swc_ecma_loader", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_preset_env 52.0.0", + "swc_ecma_quote_macros", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_transforms_proposal 41.0.3", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_transforms_typescript 45.0.2", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_plugin_proxy", + "swc_plugin_runner", + "testing", + "vergen", +] + [[package]] name = "swc_core" version = "65.0.3" @@ -8405,28 +8544,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"898413141c6d3e1fed24ac3a4c57cc61ef98194df2a7957820d48ad158a318f6" dependencies = [ "par-core", - "swc", + "swc 63.0.0", "swc_allocator", "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_lints", "swc_ecma_loader", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", - "swc_ecma_preset_env", - "swc_ecma_quote_macros", + "swc_ecma_preset_env 53.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_optimization", - "swc_ecma_transforms_proposal", - "swc_ecma_transforms_react", - "swc_ecma_transforms_typescript", + "swc_ecma_transforms_optimization 44.0.0", + "swc_ecma_transforms_react 46.0.1", + "swc_ecma_transforms_typescript 46.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_plugin_proxy", "swc_plugin_runner", - "testing", "vergen", ] @@ -8656,6 +8791,24 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "swc_ecma_compat_bugfixes" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22d4da77f7014b5efd416bb5208ab6e3d005ad5d532df8ced2904e50ca233d44" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2015 45.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", +] + [[package]] name = "swc_ecma_compat_bugfixes" version = "47.0.0" @@ -8666,7 +8819,7 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_es2015", + "swc_ecma_compat_es2015 46.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8674,6 +8827,18 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_compat_common" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d72d7d499e4bd4059ccfe432c1a52111a28fdd2b49b3882f18108fddfa3f6b4f" +dependencies = [ + "swc_common 21.0.1", + 
"swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_utils 29.1.0", +] + [[package]] name = "swc_ecma_compat_common" version = "38.0.0" @@ -8682,8 +8847,36 @@ checksum = "04b936fe418e2bd707298357f560d269c1bdedc86a2325f7163307fe140806bd" dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", + "swc_ecma_utils 29.1.0", +] + +[[package]] +name = "swc_ecma_compat_es2015" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5095800ee11e7c37df38a2e0fae2caa9d98b7801121d5f5ce70710ab65e21ec7" +dependencies = [ + "arrayvec", + "indexmap 2.13.0", + "is-macro", + "rustc-hash 2.1.1", + "serde", + "serde_derive", + "smallvec", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_common 37.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", ] [[package]] @@ -8703,10 +8896,10 @@ dependencies = [ "swc_common 21.0.1", "swc_config", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_common", - "swc_ecma_transformer", + "swc_ecma_compat_common 38.0.0", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8714,6 +8907,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_compat_es2016" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1358f912b0b5bdb6509f64dada8dc9ac8dc9233175b1d033c571cd34ad0bbec" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "tracing", +] + [[package]] name = "swc_ecma_compat_es2016" version = "43.0.0" @@ 
-8721,7 +8927,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4402a84df86ebd3723decdd041743ba8e48c7903bfe7f5c7c712bac46642ac90" dependencies = [ "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "tracing", @@ -8729,77 +8935,170 @@ dependencies = [ [[package]] name = "swc_ecma_compat_es2017" -version = "43.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d5f9f182e397fb69ea1f592770b67b94fe2bf201f3e6695cbeba66ccc1715a" +checksum = "65a437c6a98cbfed7b355e2da721a52b1731537b6debf81cadccc9f196bbdbba" dependencies = [ "serde", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2018" -version = "44.0.0" +name = "swc_ecma_compat_es2017" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "757acfefd8ececa3fd3491e7dcbf6da1b7b5fba602b70b8f2b36af30fac35eea" +checksum = "99d5f9f182e397fb69ea1f592770b67b94fe2bf201f3e6695cbeba66ccc1715a" dependencies = [ "serde", + "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2019" +name = "swc_ecma_compat_es2018" version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a0f39d1ebadade7d0a0a137cedec958cfd38fe99c5c69c762d879650b5e9848" +checksum = "27ffcf499581d598250e4d93d45ef64fe81b16f83c3bcb8c21d27af2004e6f54" dependencies = [ - "swc_common 21.0.1", + "serde", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", 
"swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2020" -version = "45.0.0" +name = "swc_ecma_compat_es2018" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170d1ba05307a49e53a55f13128e991e6d250819ed2f75be267dbd9a4a14b00d" +checksum = "757acfefd8ececa3fd3491e7dcbf6da1b7b5fba602b70b8f2b36af30fac35eea" dependencies = [ "serde", - "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_es2022", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", - "swc_ecma_visit 23.0.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2021" -version = "43.0.0" +name = "swc_ecma_compat_es2019" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfef1313a8410a2229aca737b65bb82c4aa45bdd6cedc0a0083688da0b960b20" +checksum = "5125766d7ca9c4789eefdb68fd9d1bc9eba1119df21ad3d1fd7b0ac2808893d0" dependencies = [ + "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "tracing", ] +[[package]] +name = "swc_ecma_compat_es2019" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a0f39d1ebadade7d0a0a137cedec958cfd38fe99c5c69c762d879650b5e9848" +dependencies = [ + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2020" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eba7cf139b36cdf75daf9f1fc9096f566c8034d774ce040f09f0fccd4ffe02e" +dependencies = [ + "serde", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2022 44.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + 
"swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2020" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170d1ba05307a49e53a55f13128e991e6d250819ed2f75be267dbd9a4a14b00d" +dependencies = [ + "serde", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2022 45.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2021" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f64ee2ff23cdc2bb9749f3fb730bd4a95cc26cdea84b384b85574a1ab43f78af" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2021" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfef1313a8410a2229aca737b65bb82c4aa45bdd6cedc0a0083688da0b960b20" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2022" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e0499dc93f8eb04c88d5cf6aefc4ce34fdcca9dd69155d6882eb011339c9dd" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_transforms_macros", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", +] + [[package]] name = "swc_ecma_compat_es2022" version = "45.0.0" @@ -8810,9 +9109,9 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - 
"swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8899,6 +9198,42 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_minifier" +version = "51.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c25a685c2efe2f88ba359dde0a17382b28a206ea21b23bda612f97b2c423b2f2" +dependencies = [ + "arrayvec", + "bitflags 2.9.4", + "indexmap 2.13.0", + "num-bigint", + "num_cpus", + "once_cell", + "par-core", + "par-iter", + "parking_lot", + "phf", + "radix_fmt", + "rustc-hash 2.1.1", + "ryu-js", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_hooks", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_timer", + "tracing", +] + [[package]] name = "swc_ecma_minifier" version = "52.0.4" @@ -8928,7 +9263,7 @@ dependencies = [ "swc_ecma_hooks", "swc_ecma_parser 39.0.2", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_optimization", + "swc_ecma_transforms_optimization 44.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_timer", @@ -8955,6 +9290,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_parser" +version = "38.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7c251d44e048647b5335861d1585b3e95fa8bc74f6e7a40570b0ea95d27ba66" +dependencies = [ + "bitflags 2.9.4", + "either", + "num-bigint", + "phf", + "rustc-hash 2.1.1", + "seq-macro", + "serde", + "smartstring", + "stacker", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_parser" version = "39.0.2" @@ -8969,13 +9325,37 @@ dependencies = [ "seq-macro", 
"serde", "smartstring", - "stacker", "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "tracing", ] +[[package]] +name = "swc_ecma_preset_env" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5132d5890cddc4e47feb29c3388b4b0ca2251173c2c859c4b48b896794767c54" +dependencies = [ + "anyhow", + "foldhash 0.1.5", + "indexmap 2.13.0", + "once_cell", + "precomputed-map", + "preset_env_base", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "string_enum", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms 51.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_preset_env" version = "53.0.0" @@ -8995,17 +9375,17 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms 52.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] [[package]] name = "swc_ecma_quote_macros" -version = "39.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e4d28106d86d9c45d187687688d03bab7064bd8480d8bc783df9ff2a5d5a9a" +checksum = "16896c184ff6915c85ee4bffd08db32e010b1c1a9628e6c4ee49a233653c20a7" dependencies = [ "anyhow", "proc-macro2", @@ -9014,7 +9394,7 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_parser 39.0.2", + "swc_ecma_parser 38.0.2", "swc_macros_common", "syn 2.0.106", ] @@ -9067,6 +9447,25 @@ dependencies = [ "swc_visit", ] +[[package]] +name = "swc_ecma_transformer" +version = "13.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c334a42d7d8252e5a80dbae85a1230144d29f7ed4aa7feada2a47167f9282e" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_regexp", + "swc_ecma_hooks", + "swc_ecma_regexp", + 
"swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transformer" version = "14.0.0" @@ -9086,6 +9485,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94503bbcd555d82cb33ff0e591e935bb925b79b254e94e706521f15d762b473" +dependencies = [ + "par-core", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_compat 47.0.0", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_transforms_proposal 41.0.3", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_transforms_typescript 45.0.2", + "swc_ecma_utils 29.1.0", +] + [[package]] name = "swc_ecma_transforms" version = "52.0.0" @@ -9096,11 +9513,11 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_compat", - "swc_ecma_transforms_optimization", - "swc_ecma_transforms_proposal", - "swc_ecma_transforms_react", - "swc_ecma_transforms_typescript", + "swc_ecma_transforms_compat 48.0.0", + "swc_ecma_transforms_optimization 44.0.0", + "swc_ecma_transforms_proposal 42.0.0", + "swc_ecma_transforms_react 46.0.1", + "swc_ecma_transforms_typescript 46.0.1", "swc_ecma_utils 29.1.0", ] @@ -9126,6 +9543,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_base" +version = "41.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6be824dc326da1f7673d1e241790626e5f39f09e1d896175134143408eeaa081" +dependencies = [ + "better_scoped_tls", + "indexmap 2.13.0", + "once_cell", + "par-core", + "phf", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_base" version = "42.0.0" @@ 
-9148,6 +9587,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_classes" +version = "41.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ffae23e996fa1a7b20b77ff599aa0e4997a6eb21369e2e5e906c91b89fdffaa" +dependencies = [ + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_classes" version = "42.0.0" @@ -9161,6 +9613,34 @@ dependencies = [ "swc_ecma_visit 23.0.0", ] +[[package]] +name = "swc_ecma_transforms_compat" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd54b7d82f0037f03367b4c9052a4ba2913e044df009fbeac388b2142c3ddd8a" +dependencies = [ + "indexmap 2.13.0", + "par-core", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_bugfixes 46.0.0", + "swc_ecma_compat_common 37.0.0", + "swc_ecma_compat_es2015 45.0.0", + "swc_ecma_compat_es2016 42.0.0", + "swc_ecma_compat_es2017 42.0.0", + "swc_ecma_compat_es2018 43.0.0", + "swc_ecma_compat_es2019 42.0.0", + "swc_ecma_compat_es2020 44.0.0", + "swc_ecma_compat_es2021 42.0.0", + "swc_ecma_compat_es2022 44.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_compat" version = "48.0.0" @@ -9173,16 +9653,16 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_bugfixes", - "swc_ecma_compat_common", - "swc_ecma_compat_es2015", - "swc_ecma_compat_es2016", - "swc_ecma_compat_es2017", - "swc_ecma_compat_es2018", - "swc_ecma_compat_es2019", - "swc_ecma_compat_es2020", - "swc_ecma_compat_es2021", - "swc_ecma_compat_es2022", + "swc_ecma_compat_bugfixes 47.0.0", + "swc_ecma_compat_common 38.0.0", + "swc_ecma_compat_es2015 46.0.0", + "swc_ecma_compat_es2016 43.0.0", + "swc_ecma_compat_es2017 43.0.0", + 
"swc_ecma_compat_es2018 44.0.0", + "swc_ecma_compat_es2019 43.0.0", + "swc_ecma_compat_es2020 45.0.0", + "swc_ecma_compat_es2021 43.0.0", + "swc_ecma_compat_es2022 45.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -9201,6 +9681,30 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "swc_ecma_transforms_optimization" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae12179c92f0690850bae8932dfac2b7f191b8bfc6bac80dd81abfe6b0c014aa" +dependencies = [ + "bytes-str", + "dashmap 5.5.3", + "indexmap 2.13.0", + "once_cell", + "par-core", + "petgraph 0.7.1", + "rustc-hash 2.1.1", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_optimization" version = "44.0.0" @@ -9225,6 +9729,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_proposal" +version = "41.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02c49fd90ad7ef87cfacb9e15eb939bfecac83fe6638fdd4f94a31eff56b8276" +dependencies = [ + "either", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_proposal" version = "42.0.0" @@ -9238,7 +9760,32 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + +[[package]] +name = "swc_ecma_transforms_react" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b41b35e76a78a01650dcfb92889d37fdebbc3b86932a052259c2a99e7955e699" +dependencies = [ + "base64 0.22.1", + "bytes-str", + "indexmap 2.13.0", + "once_cell", + "rustc-hash 2.1.1", + "serde", + "sha1", + "string_enum", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_hooks", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] @@ -9268,6 +9815,24 @@ dependencies = [ "swc_ecma_visit 23.0.0", ] +[[package]] +name = "swc_ecma_transforms_typescript" +version = "45.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d25026f22efe873b50c97b3aaca6bfd178f954031effd14394e7b3add1e95fb" +dependencies = [ + "bytes-str", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_typescript" version = "46.0.1" @@ -9281,7 +9846,7 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_react", + "swc_ecma_transforms_react 46.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] @@ -9357,9 +9922,9 @@ dependencies = [ [[package]] name = "swc_emotion" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7077db4cd3dc9908a860c2e55b40ae6de8d6ce41d919867f2e58eb81b4019718" +checksum = "11d8058e754b05eb672671b71974c4f79673b32bc2a2763706ba6970f8d2c86f" dependencies = [ "base64 0.22.1", "byteorder", @@ -9373,7 +9938,7 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_transforms", + "swc_ecma_transforms 51.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_sourcemap", @@ -9506,9 +10071,9 @@ dependencies = [ [[package]] name = "swc_relay" -version = "4.0.0" 
+version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b592abba81c24baad593d6130a162beaa50699b5c2ba791a5b0db7be2dff1db4" +checksum = "d1a0e98d0497d914f2a0736be9be050af6c3c0fbb2a9d911dae40379fffcc7c8" dependencies = [ "once_cell", "regex", @@ -10465,7 +11030,6 @@ dependencies = [ "auto-hash-map", "bincode 2.0.1", "concurrent-queue", - "ctor 0.10.1", "dashmap 6.1.0", "either", "erased-serde", @@ -10638,10 +11202,8 @@ dependencies = [ name = "turbo-tasks-malloc" version = "0.1.0" dependencies = [ - "libc", "libmimalloc-sys", "mimalloc", - "windows-sys 0.60.2", ] [[package]] @@ -10741,6 +11303,7 @@ dependencies = [ "either", "indexmap 2.13.0", "num-bigint", + "once_cell", "patricia_tree", "petgraph 0.8.3", "ref-cast", @@ -10750,7 +11313,7 @@ dependencies = [ "serde", "serde_json", "smallvec", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_sourcemap", "tracing", "turbo-bincode", @@ -10782,7 +11345,7 @@ dependencies = [ "rustc-hash 2.1.1", "serde", "smallvec", - "swc_core 65.0.3", + "swc_core 63.1.3", "tokio", "tracing", "turbo-bincode", @@ -10849,6 +11412,7 @@ dependencies = [ "itertools 0.10.5", "num-bigint", "num-traits", + "once_cell", "parking_lot", "petgraph 0.8.3", "phf", @@ -10858,7 +11422,7 @@ dependencies = [ "serde_json", "smallvec", "strsim 0.11.1", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_sourcemap", "tokio", "tracing", @@ -10873,7 +11437,6 @@ dependencies = [ "turbopack-resolve", "turbopack-swc-utils", "url", - "urlencoding", ] [[package]] @@ -10901,7 +11464,7 @@ dependencies = [ "serde_json", "styled_components", "styled_jsx", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_emotion", "swc_plugin_backend_wasmtime", "swc_relay", @@ -10951,8 +11514,9 @@ dependencies = [ "bincode 2.0.1", "image", "mime", - "phf", + "once_cell", "regex", + "rustc-hash 2.1.1", "serde", "turbo-bincode", "turbo-rcstr", @@ -10996,6 +11560,7 @@ dependencies = [ "js-sys", "napi", "napi-derive", + "once_cell", "owo-colors", "parking_lot", 
"regex", @@ -11081,7 +11646,7 @@ version = "0.1.0" dependencies = [ "anyhow", "parking_lot", - "swc_core 65.0.3", + "swc_core 63.1.3", "turbo-rcstr", "turbo-tasks", "turbopack-core", @@ -11093,6 +11658,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bincode 2.0.1", + "once_cell", "regex", "rustc-hash 2.1.1", "serde", @@ -11121,7 +11687,6 @@ dependencies = [ "rustc-hash 2.1.1", "serde", "serde_json", - "smallvec", "tungstenite 0.21.0", "turbo-rcstr", "turbo-tasks-malloc", @@ -11136,6 +11701,7 @@ dependencies = [ "anyhow", "crossbeam-channel", "crossbeam-utils", + "once_cell", "parking_lot", "postcard", "rustc-hash 2.1.1", diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index c8b758a6f..f5d883d8e 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -69,6 +69,14 @@ impl Context { /// Create BuildDepsOptions with PipelineReceiver for concurrent download/clone. /// Returns (options, channels) where channels are used to start pipeline workers. + /// + /// Sets `skip_preload=true` so ruborist's `service::api::build_deps` + /// routes through `mb_fetch_with_graph` (folded preload + graph + /// build). The pipeline still receives `PackageResolved` / + /// `PackagePlaced` events — emitted from inside + /// `mb_fetch_with_graph` (main loop and graph worker + /// respectively) — so download/clone start as early as the + /// classic preload+BFS path. 
pub async fn pipeline_deps_options( cwd: PathBuf, ) -> ( @@ -76,7 +84,8 @@ impl Context { PipelineChannels, ) { let (receiver, channels) = PipelineReceiver::new(ProgressReceiver); - let options = Self::deps_options(cwd, receiver).await; + let mut options = Self::deps_options(cwd, receiver).await; + options.skip_preload = true; (options, channels) } diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 197fcbc26..33e3819a8 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -51,6 +51,7 @@ use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; +use crate::traits::progress::{BuildEvent, EventReceiver}; use crate::traits::registry::ResolvedPackage; #[derive(Debug, Default)] @@ -131,6 +132,11 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve /// `body_cache` and trigger sibling drain. struct FetchOutcome { name: String, + /// The spec that triggered this fetch / settle. Used by the + /// main loop to look up the cached `CoreVersionManifest` for + /// `PackageResolved` event emission (the future already wrote + /// `(name, primary_spec)` to the cache). + primary_spec: String, transitives: Vec, fetched: bool, /// Per-future wall (network + body recv + spawn_blocking parse). 
@@ -211,6 +217,7 @@ fn spawn_fetch( ) -> Fut { Box::pin(async move { let fut_start = Instant::now(); + let primary_spec = spec.clone(); let url = format!("{}/{}", registry_url, name); let resp = match client .get(&url) @@ -223,6 +230,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + primary_spec, transitives: Vec::new(), fetched: true, wall_us, @@ -236,6 +244,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + primary_spec, transitives: Vec::new(), fetched: true, wall_us, @@ -270,6 +279,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + primary_spec, transitives, fetched: true, wall_us, @@ -289,6 +299,7 @@ fn spawn_settle( ) -> Fut { Box::pin(async move { let fut_start = Instant::now(); + let primary_spec = spec.clone(); let spec_for_parse = spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { @@ -314,6 +325,7 @@ fn spawn_settle( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + primary_spec, transitives, fetched: false, wall_us, @@ -593,13 +605,17 @@ struct FetchEventMsg { name: String, } -pub async fn mb_fetch_with_graph( +pub async fn mb_fetch_with_graph( mut graph: DependencyGraph, registry_url: &str, cache: &MemoryCache, preload_config: &PreloadConfig, build_config: &BuildDepsConfig, -) -> Result<(DependencyGraph, MbFetchStats)> { + receiver: Arc, +) -> Result<(DependencyGraph, MbFetchStats)> +where + R: EventReceiver + 'static, +{ let mut stats = MbFetchStats::default(); let total_start = Instant::now(); @@ -663,6 +679,7 @@ pub async fn mb_fetch_with_graph( // loop CPU starving the runtime's IO polling. 
let cache_clone = cache.clone(); let build_config_owned = build_config.clone(); + let receiver_for_graph = Arc::clone(&receiver); let graph_handle = tokio::spawn(graph_worker( graph, edge_targets, @@ -671,6 +688,7 @@ pub async fn mb_fetch_with_graph( build_config_owned, fetch_rx, specs_tx, + receiver_for_graph, )); // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). @@ -747,6 +765,21 @@ pub async fn mb_fetch_with_graph( settle_count += 1; } + // Pipeline early-start signal: emit + // PackageResolved as soon as the manifest is in + // cache. The install path's PipelineReceiver + // forwards this to the download worker so + // tarball download begins before BFS finishes. + // For lockfile-only callers (NoopReceiver), this + // is a no-op. + if let Some(core_arc) = + cache.get_version_manifest(&out.name, &out.primary_spec) + { + receiver.on_event(BuildEvent::PackageResolved( + (&*core_arc).into(), + )); + } + // Drain sibling specs deferred while the fetch // was in flight. Sibling settles also produce a // FetchEventMsg downstream. @@ -828,7 +861,8 @@ struct GraphWorkerStats { /// back. Designed to monopolize a tokio runtime worker thread so /// the main loop's worker can drive socket polling without /// competing for CPU. 
-async fn graph_worker( +#[allow(clippy::too_many_arguments)] +async fn graph_worker( mut graph: DependencyGraph, mut edge_targets: EdgeTargets, mut seen_specs: HashSet<(String, String)>, @@ -836,7 +870,12 @@ async fn graph_worker( build_config: BuildDepsConfig, mut fetch_rx: mpsc::Receiver, specs_tx: mpsc::Sender>, -) -> Result<(DependencyGraph, GraphWorkerStats)> { + receiver: Arc, +) -> Result<(DependencyGraph, GraphWorkerStats)> +where + R: EventReceiver + 'static, +{ + use crate::model::manifest::NodeManifest; let mut stats = GraphWorkerStats::default(); while let Some(msg) = fetch_rx.recv().await { @@ -888,6 +927,22 @@ async fn graph_worker( &build_config, ); if let ProcessResult::Created(new_idx) = result { + // Pipeline clone signal: emit PackagePlaced so + // the install path's clone worker can begin + // hardlinking from cache as soon as a node is + // placed in the graph. lockfile-only callers + // (NoopReceiver) drop this on the floor. + if let Some(node) = graph.get_node(new_idx) + && let NodeManifest::Registry(ref manifest) = node.manifest + { + let parent_path = graph.get_node(parent_idx).map(|p| p.path.as_path()); + receiver.on_event(BuildEvent::PackagePlaced { + package: manifest.as_ref().into(), + path: &node.path, + parent_path, + }); + } + // Walk the new node's edges. 
enqueue handles // recursive cache-hit drain so already-cached // specs get processed inline (still on this diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 6c9ada9a8..2dc7d62e8 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -131,7 +131,7 @@ pub struct BuildDepsOutput { pub async fn build_deps(options: BuildDepsOptions) -> Result where G: Glob + Clone, - R: EventReceiver, + R: EventReceiver + 'static, { let BuildDepsOptions { cwd, @@ -279,6 +279,11 @@ where // `build_deps_with_config` call still runs to handle any // non-registry edges (workspace / git / http / file) the fold // path skipped, but on registry-only workloads it's near no-op. + // Wrap receiver in Arc so the folded mb_fetch_with_graph can + // share it with its spawned graph_worker task. The follow-up + // BFS sweep also holds an &Arc via deref. + let receiver = Arc::new(receiver); + let folded = skip_preload_caller && cache_count == 0; if folded { let preload_config = PreloadConfig { @@ -291,6 +296,7 @@ where registry.cache(), &preload_config, &config, + Arc::clone(&receiver), ) .await .map_err(|e| e.context("mb_fetch_with_graph failed"))?; @@ -305,7 +311,7 @@ where // (non-registry: workspace / git / http / file). On // registry-only workloads (the common case) the graph is fully // built already, BFS walks nothing. 
- build_deps_with_config(&mut graph, ®istry, config, &receiver) + build_deps_with_config(&mut graph, ®istry, config, &*receiver) .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; From 21d9c7dce4c03b507701e0e4b2ca4bc2dc7b0ad8 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sun, 10 May 2026 00:32:24 +0800 Subject: [PATCH 32/32] =?UTF-8?q?fix(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20normalize=20npm:=20alias=20specs=20before=20fetch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously spawn_fetch / spawn_settle used the raw dep key as both the registry path segment and the cache lookup key. For an npm-alias dep like \`\"ms\": \"npm:raw-body@2.1.3\"\` this hit \`registry/ms\` instead of \`registry/raw-body\`, parsed ms's manifest against \`npm:raw-body@2.1.3\`, and ultimately installed the real ms into \`node_modules/ms/\` rather than raw-body. e2e \`utoo-pm.sh:466\` (\"top-level ms should be raw-body\") caught this on d1cf53e2. Fix: - spawn_fetch / spawn_settle call \`normalize_spec\` to split out the real package name + spec; URL hits \`registry/{real_name}\` and the combined parse runs against \`real_spec\` so version resolution sees the right manifest envelope. - Cache writes go under both keys: the original \`(alias_name, alias_spec)\` so \`graph_worker\` finds the manifest via \`edge_targets\`, and the normalized \`(real_name, resolved_version)\` for direct-dep dedup. - Main loop dedup state (in_flight_names / deferred_by_name / body_cache) keys by real_name so two distinct aliases pointing at the same registry package share dedup; deferred entries store \`(alias_name, spec)\` so the drain spawns spawn_settle with the correct cache key. - Adds \`real_name\` to FetchOutcome so the deferred-drain step can look up by real name without re-normalizing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 127 ++++++++++++++------- 1 file changed, 86 insertions(+), 41 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 33e3819a8..6af0c80f1 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -48,6 +48,7 @@ use crate::resolver::builder::{ BuildDepsConfig, ProcessResult, collect_unresolved_edges, process_dependency_with_resolved, }; use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::semver::normalize_spec; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; @@ -131,7 +132,16 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve /// happened inside the future. Only `fetched=true` futures populate /// `body_cache` and trigger sibling drain. struct FetchOutcome { + /// The dep key (alias name as it appears in the parent's deps map). + /// Used by `graph_worker` to filter `edge_targets`, which is keyed + /// on the alias. name: String, + /// The real package name after npm-alias normalization (e.g. + /// `name="ms"` + `spec="npm:raw-body@2.1.3"` → `real_name="raw-body"`). + /// Used by the main loop for `body_cache` / `deferred_by_name` / + /// `in_flight_names` keying, so two distinct aliases pointing at + /// the same package share dedup. + real_name: String, /// The spec that triggered this fetch / settle. 
Used by the /// main loop to look up the cached `CoreVersionManifest` for /// `PackageResolved` event emission (the future already wrote @@ -218,7 +228,13 @@ fn spawn_fetch( Box::pin(async move { let fut_start = Instant::now(); let primary_spec = spec.clone(); - let url = format!("{}/{}", registry_url, name); + // Normalize npm-alias / workspace specs so the registry hit + // and the manifest parse run against the *real* package, not + // the alias name. Cache writes still go under the original + // (alias_name, alias_spec) key so `graph_worker` can locate + // them via `edge_targets`. + let (real_name, real_spec) = normalize_spec(&name, &spec); + let url = format!("{}/{}", registry_url, real_name); let resp = match client .get(&url) .header("accept", "application/vnd.npm.install-v1+json") @@ -230,6 +246,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + real_name, primary_spec, transitives: Vec::new(), fetched: true, @@ -244,6 +261,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + real_name, primary_spec, transitives: Vec::new(), fetched: true, @@ -254,23 +272,31 @@ fn spawn_fetch( }; let net_us = fut_start.elapsed().as_micros() as u64; let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); - // Stash in body_cache early so concurrent sibling specs - // arriving slightly after see it on their pending pop. - body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); - - let spec_for_parse = spec.clone(); + // Body cache is keyed by real_name so two aliases pointing at + // the same registry package share the body and only one fetch + // fires. Sibling drains know to use real_name (see + // `deferred_by_name` keying in the main loop). 
+ body_cache + .lock() + .insert(real_name.clone(), Arc::clone(&raw_arc)); + + let real_spec_for_parse = real_spec.clone(); let peer = peer_deps; - let parsed = - tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) - .await - .ok() - .flatten(); + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(raw_arc, &real_spec_for_parse, peer) + }) + .await + .ok() + .flatten(); let transitives = match parsed { Some((full_arc, resolved, core_arc, transitives)) => { - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_full_manifest(real_name.clone(), Arc::clone(&full_arc)); + // Under the alias key so `graph_worker` finds it. cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, core_arc); + // Under the real key so subsequent direct deps on + // the same package@version dedupe correctly. + cache.set_version_manifest(real_name.clone(), resolved, core_arc); transitives } None => Vec::new(), @@ -279,6 +305,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + real_name, primary_spec, transitives, fetched: true, @@ -300,10 +327,11 @@ fn spawn_settle( Box::pin(async move { let fut_start = Instant::now(); let primary_spec = spec.clone(); - let spec_for_parse = spec.clone(); + let (real_name, real_spec) = normalize_spec(&name, &spec); + let real_spec_for_parse = real_spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { - parse_combined(Arc::clone(&raw), &spec_for_parse, peer) + parse_combined(Arc::clone(&raw), &real_spec_for_parse, peer) }) .await .ok() @@ -312,11 +340,12 @@ fn spawn_settle( let transitives = match parsed { Some((full_arc, resolved, core_arc, transitives)) => { // Don't overwrite full_manifest — the original fetcher - // already set it. Only populate the version-manifest - // slots so BFS hits the (name, spec) early-return. 
- cache.set_full_manifest(name.clone(), full_arc); + // already set it under real_name. Populate version + // slots so BFS hits the (alias_name, alias_spec) + // early-return. + cache.set_full_manifest(real_name.clone(), full_arc); cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, core_arc); + cache.set_version_manifest(real_name.clone(), resolved, core_arc); transitives } None => Vec::new(), @@ -325,6 +354,7 @@ fn spawn_settle( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + real_name, primary_spec, transitives, fetched: false, @@ -376,12 +406,15 @@ pub async fn mb_fetch( } } - // Sibling-fetch dedup: when two specs for the same name are both - // in flight, only the first fires a fetch; the second arrives at - // the cached body and goes through `spawn_settle` instead. + // Sibling-fetch dedup: when two specs for the same package are + // both in flight, only the first fires a fetch; the second + // arrives at the cached body and goes through `spawn_settle`. + // Keyed by *real* package name (post npm-alias normalization) + // so two distinct aliases pointing at the same registry package + // share dedup. let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); - let mut in_flight_names: HashSet = HashSet::new(); - let mut deferred_by_name: HashMap> = HashMap::new(); + let mut in_flight_real_names: HashSet = HashSet::new(); + let mut deferred_by_real_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); @@ -391,14 +424,18 @@ pub async fn mb_fetch( let Some((name, spec)) = pending.pop_front() else { break; }; + let (real_name, _) = normalize_spec(&name, &spec); // Sibling fast path: body already cached. 
- if let Some(raw) = body_cache.lock().get(&name).cloned() { + if let Some(raw) = body_cache.lock().get(&real_name).cloned() { futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); continue; } - // Defer if a fetch for this name is already in flight. - if !in_flight_names.insert(name.clone()) { - deferred_by_name.entry(name).or_default().push(spec); + // Defer if a fetch for this real package is already in flight. + if !in_flight_real_names.insert(real_name.clone()) { + deferred_by_real_name + .entry(real_name) + .or_default() + .push((name, spec)); continue; } futs.push(spawn_fetch( @@ -439,12 +476,12 @@ pub async fn mb_fetch( // Drain sibling specs deferred while the fetch was in flight. if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() + && let Some(siblings) = deferred_by_real_name.remove(&out.real_name) + && let Some(raw) = body_cache.lock().get(&out.real_name).cloned() { - for sibling_spec in siblings { + for (sibling_name, sibling_spec) in siblings { futs.push(spawn_settle( - out.name.clone(), + sibling_name, sibling_spec, Arc::clone(&raw), cache.clone(), @@ -692,9 +729,13 @@ where )); // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). + // Keyed by *real* package name (post npm-alias normalization) + // so two distinct aliases pointing at the same registry package + // share dedup; siblings store their alias `(name, spec)` so the + // drain knows how to spawn `spawn_settle` with the right cache key. 
let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); - let mut in_flight_names: HashSet = HashSet::new(); - let mut deferred_by_name: HashMap> = HashMap::new(); + let mut in_flight_real_names: HashSet = HashSet::new(); + let mut deferred_by_real_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let mut sum_wall_us: u64 = 0; @@ -712,12 +753,16 @@ where let Some((name, spec)) = pending.pop_front() else { break; }; - if let Some(raw) = body_cache.lock().get(&name).cloned() { + let (real_name, _) = normalize_spec(&name, &spec); + if let Some(raw) = body_cache.lock().get(&real_name).cloned() { futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); continue; } - if !in_flight_names.insert(name.clone()) { - deferred_by_name.entry(name).or_default().push(spec); + if !in_flight_real_names.insert(real_name.clone()) { + deferred_by_real_name + .entry(real_name) + .or_default() + .push((name, spec)); continue; } futs.push(spawn_fetch( @@ -784,12 +829,12 @@ where // was in flight. Sibling settles also produce a // FetchEventMsg downstream. if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() + && let Some(siblings) = deferred_by_real_name.remove(&out.real_name) + && let Some(raw) = body_cache.lock().get(&out.real_name).cloned() { - for sibling_spec in siblings { + for (sibling_name, sibling_spec) in siblings { futs.push(spawn_settle( - out.name.clone(), + sibling_name, sibling_spec, Arc::clone(&raw), cache.clone(),