From 2bf71158cde397de4b423ce100385a0e5561e900 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 21:56:24 +0800 Subject: [PATCH 01/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=20256=20+=20add=20fetch=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit p1_resolve has been ~0.9s behind bun on phases bench for the past several PRs. Pcap on prior runs measured bun opening ~260 parallel TCP streams against registry.npmjs.org for resolve, while utoo opened ~70 (the 64 manifests-concurrency-limit cap was at saturation). Adding fetch-breakdown timing in ruborist showed where p1's 22s (local Mac) actually goes: fetch-timings: n=2730 sum_request = 1089s (88% — TCP+TLS+HTTP RTT to first byte) sum_body = 138s (11% — body download) sum_parse = 2s (0.16% — simd_json on rayon) The dominant cost is per-request RTT, not parsing or body transfer. The lever is the cap on concurrent in-flight requests. This commit: 1. Adds `crates/ruborist/src/util/timing.rs` — process-wide atomic accumulator that records per-fetch (request_us, body_us, parse_us, bytes) inside both `fetch_full_manifest` and `fetch_version_manifest`. Reset before each preload phase, dumped at INFO level after preload + bfs. 2. Bumps `manifests-concurrency-limit` default 64 → 256 to match bun's observed working point against npmjs.org. CI bench will validate. Expected: p1 utoo wall drops toward bun's range (~2.3s on GHA). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 10 +- crates/ruborist/src/resolver/builder.rs | 17 ++- crates/ruborist/src/service/manifest.rs | 24 ++++- crates/ruborist/src/util/mod.rs | 2 + crates/ruborist/src/util/timing.rs | 134 ++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 6 deletions(-) create mode 100644 crates/ruborist/src/util/timing.rs diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 34ee45a34..bc281fb40 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,9 +132,15 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration +// Manifest fetch concurrency configuration. +// +// 256 to match bun's observed ~260 parallel TCP streams against +// registry.npmjs.org. Local fetch-breakdown instrumentation showed +// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), +// only 11% body, 0.16% parse — so the dominant lever for p1 wall is +// the cap on concurrent in-flight manifest requests. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index b0bf2794c..166372c91 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -756,6 +756,7 @@ async fn run_preload_phase( return; } + crate::util::FETCH_TIMINGS.reset(); let start = tokio::time::Instant::now(); let initial_deps = gather_preload_deps(graph, config.peer_deps); @@ -794,7 +795,13 @@ async fn run_preload_phase( failed: stats.failed_count, }); - tracing::debug!("Preload phase: {:?}", start.elapsed()); + let preload_elapsed = start.elapsed(); + tracing::debug!("Preload phase: {:?}", preload_elapsed); + tracing::info!( + "p1-breakdown preload_wall={}ms | {}", + preload_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); } /// Run the BFS traversal phase to build the dependency tree. 
@@ -896,7 +903,13 @@ async fn run_bfs_phase( current_level = next_level; } - tracing::debug!("Build phase: {:?}", start.elapsed()); + let bfs_elapsed = start.elapsed(); + tracing::debug!("Build phase: {:?}", bfs_elapsed); + tracing::info!( + "p1-breakdown bfs_wall={}ms | {}", + bfs_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); Ok(()) } diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 74baf3b9c..36bc6a85a 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -12,6 +12,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on rayon's CPU thread pool (native) or inline /// (wasm32). Keeps the tokio runtime free of `simd_json` work so other @@ -91,7 +92,9 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result) -> Result(bytes) + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = bytes.len() as u64; + let t_parse_start = std::time::Instant::now(); + let parsed = parse_json_off_runtime::(bytes) .await - .map_err(FetchError::Permanent) + .map_err(FetchError::Permanent); + let parse_us = t_parse_start.elapsed().as_micros() as u64; + if parsed.is_ok() { + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + } + parsed } else { Err(classify_status(response.status(), &url)) } diff --git a/crates/ruborist/src/util/mod.rs b/crates/ruborist/src/util/mod.rs index 649e47c95..a7f0b7b7d 100644 --- a/crates/ruborist/src/util/mod.rs +++ b/crates/ruborist/src/util/mod.rs @@ -1,6 +1,8 @@ //! Shared utility primitives for ruborist and downstream consumers. 
pub mod oncemap; +pub mod timing; pub use crate::model::util::{PackageNameStr, parse_package_spec, read_package_json}; pub use oncemap::OnceMap; +pub use timing::{FETCH_TIMINGS, FetchTimings, FetchTimingsSnapshot}; diff --git a/crates/ruborist/src/util/timing.rs b/crates/ruborist/src/util/timing.rs new file mode 100644 index 000000000..f50e921b9 --- /dev/null +++ b/crates/ruborist/src/util/timing.rs @@ -0,0 +1,134 @@ +//! Per-phase manifest fetch timing accumulator for p1 perf investigation. +//! +//! Splits each `fetch_*_manifest` call into three observable pieces: +//! - `request_us`: from `request.send().await` to response headers +//! received. Captures TCP connect (when not pooled), TLS handshake, +//! HTTP request roundtrip, and server-side processing. +//! - `body_us`: from response headers to the entire JSON body buffered. +//! Network-bandwidth bound for large packuments. +//! - `parse_us`: from full body buffered to a typed manifest. CPU bound +//! (simd_json on a spawn_blocking thread). +//! +//! `parse_us` is wall-clock for the await on `parse_json_off_runtime` — +//! since JSON parse runs on `spawn_blocking`, this includes scheduling +//! latency rather than pure CPU time. Together with the per-fetch total +//! already tracked in `preload_manifests`, this lets us answer "where +//! did p1's wall time go?" without external profiling. +//! +//! All counters are `AtomicU64` so the recording path is lock-free. +//! Numbers are reset between resolves via [`reset()`] so successive +//! `utoo deps` invocations report independently. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Per-process accumulator for manifest fetch timings. +#[derive(Default, Debug)] +pub struct FetchTimings { + /// Number of fetches recorded (full + version manifest). + pub count: AtomicU64, + /// Sum of microseconds spent in `request.send().await`. + pub request_us: AtomicU64, + /// Sum of microseconds spent in `response.bytes().await`. 
+ pub body_us: AtomicU64, + /// Sum of microseconds spent awaiting `parse_json_off_runtime`. + pub parse_us: AtomicU64, + /// Sum of body bytes received across all fetches. + pub bytes: AtomicU64, +} + +impl FetchTimings { + /// Record one fetch's split timings. Call once per successful fetch. + pub fn record(&self, request_us: u64, body_us: u64, parse_us: u64, bytes: u64) { + self.count.fetch_add(1, Ordering::Relaxed); + self.request_us.fetch_add(request_us, Ordering::Relaxed); + self.body_us.fetch_add(body_us, Ordering::Relaxed); + self.parse_us.fetch_add(parse_us, Ordering::Relaxed); + self.bytes.fetch_add(bytes, Ordering::Relaxed); + } + + /// Reset all counters to zero. + pub fn reset(&self) { + self.count.store(0, Ordering::Relaxed); + self.request_us.store(0, Ordering::Relaxed); + self.body_us.store(0, Ordering::Relaxed); + self.parse_us.store(0, Ordering::Relaxed); + self.bytes.store(0, Ordering::Relaxed); + } + + /// Snapshot of the current accumulator state. + pub fn snapshot(&self) -> FetchTimingsSnapshot { + FetchTimingsSnapshot { + count: self.count.load(Ordering::Relaxed), + request_us: self.request_us.load(Ordering::Relaxed), + body_us: self.body_us.load(Ordering::Relaxed), + parse_us: self.parse_us.load(Ordering::Relaxed), + bytes: self.bytes.load(Ordering::Relaxed), + } + } +} + +/// Immutable snapshot suitable for printing. +#[derive(Debug, Clone, Copy)] +pub struct FetchTimingsSnapshot { + pub count: u64, + pub request_us: u64, + pub body_us: u64, + pub parse_us: u64, + pub bytes: u64, +} + +impl FetchTimingsSnapshot { + /// One-line summary for tracing logs. 
+ pub fn summary_line(&self) -> String { + if self.count == 0 { + return "fetch-timings: no requests recorded".to_string(); + } + let count = self.count; + let avg_req = self.request_us / count; + let avg_body = self.body_us / count; + let avg_parse = self.parse_us / count; + let avg_bytes = self.bytes / count; + format!( + "fetch-timings: n={} sum_request={}ms sum_body={}ms sum_parse={}ms total_bytes={}MB | avg_request={}us avg_body={}us avg_parse={}us avg_bytes={}KB", + count, + self.request_us / 1_000, + self.body_us / 1_000, + self.parse_us / 1_000, + self.bytes / 1_000_000, + avg_req, + avg_body, + avg_parse, + avg_bytes / 1_024, + ) + } +} + +/// Process-wide manifest fetch timing accumulator. +pub static FETCH_TIMINGS: FetchTimings = FetchTimings { + count: AtomicU64::new(0), + request_us: AtomicU64::new(0), + body_us: AtomicU64::new(0), + parse_us: AtomicU64::new(0), + bytes: AtomicU64::new(0), +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn record_and_snapshot() { + FETCH_TIMINGS.reset(); + FETCH_TIMINGS.record(100, 200, 300, 1024); + FETCH_TIMINGS.record(150, 250, 350, 2048); + let snap = FETCH_TIMINGS.snapshot(); + assert_eq!(snap.count, 2); + assert_eq!(snap.request_us, 250); + assert_eq!(snap.body_us, 450); + assert_eq!(snap.parse_us, 650); + assert_eq!(snap.bytes, 3072); + FETCH_TIMINGS.reset(); + let snap2 = FETCH_TIMINGS.snapshot(); + assert_eq!(snap2.count, 0); + } +} From 8ac97ae036ab97cb986ce19109af18e130dbc1cd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:25:36 +0800 Subject: [PATCH 02/24] =?UTF-8?q?chore(p1):=20revert=20concurrency=20256?= =?UTF-8?q?=20=E2=86=92=2064=20+=20restore=20manifest-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes after the GHA bench on the previous commit (PR #2916, run 25559625024) showed the concurrency=256 hypothesis was wrong on GHA's environment. 
Revert concurrency 256 → 64 --------------------------- The new fetch-timing instrumentation shipped in the previous commit caught the surprise: GHA's pcap-vs-local profile is the *opposite* of what local Mac measurements suggested. metric local Mac GHA Linux avg_request 399ms 70ms ← network MUCH faster on GHA avg_body 50ms 20ms avg_parse 730µs 266ms ← parse 365× SLOWER on GHA Mechanism: `parse_json_off_runtime` dispatches to `rayon::spawn`, and rayon's pool size is `num_cpus` (= 2 on GHA ubuntu-latest). Bumping concurrency 64 → 256 queued 256 manifest parses behind 2 rayon workers — head-of-line blocking. avg_parse jumped from ~10ms to 266ms wall, dragging p1 utoo wall from 3.10s up to 3.33s. Restore manifest-bench ---------------------- Brought back `crates/manifest-bench` (originally landed in the post-#2818 driver hunt, dropped in af714eb3 once #2818 graduated). It's a single-binary HTTP-only fetch tool that strips out the ruborist pipeline (no BFS, no dedup, no parse, no project cache, no lockfile write) — fires `GET /` in parallel and reports the same diag shape as the new `p1-breakdown` lines. Goal: separate the network ceiling from the resolver pipeline so the next round of p1 experiments (parse offload, partial parse, dedicated parse pool, etc.) can be evaluated against a stable "pure network" baseline. Knobs (unchanged from the original drop): --concurrency N sweep without rebuilding utoo --reps N run same workload back-to-back --single-version use //latest (smaller bodies) --user-agent X UA-fingerprint experiments --http1-only H2 vs H1 toggle --accept X override Accept header Same TLS stack as ruborist (rustls + aws-lc-rs, native roots). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 1 + crates/manifest-bench/Cargo.toml | 37 +++ crates/manifest-bench/src/main.rs | 371 ++++++++++++++++++++++++++++++ crates/pm/src/util/user_config.rs | 19 +- 4 files changed, 421 insertions(+), 7 deletions(-) create mode 100644 crates/manifest-bench/Cargo.toml create mode 100644 crates/manifest-bench/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index ef4a4f926..0574a185a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] resolver = "2" members = [ + "crates/manifest-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/crates/manifest-bench/Cargo.toml b/crates/manifest-bench/Cargo.toml new file mode 100644 index 000000000..5b01e57c0 --- /dev/null +++ b/crates/manifest-bench/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "manifest-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Standalone HTTP-only manifest fetch benchmark, isolating network behaviour from ruborist's resolver pipeline." + +[[bin]] +name = "manifest-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Identical TLS / DNS choices to ruborist so we measure the *protocol* +# characteristics of the same stack, not a different implementation. 
+reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/manifest-bench/src/main.rs b/crates/manifest-bench/src/main.rs new file mode 100644 index 000000000..fa70f3fe4 --- /dev/null +++ b/crates/manifest-bench/src/main.rs @@ -0,0 +1,371 @@ +//! Standalone HTTP-only manifest fetch benchmark. +//! +//! Isolates the network behaviour of `reqwest + rustls + tokio` from +//! ruborist's resolver pipeline (BFS, dedup, parse, lockfile, project +//! cache). Reads a list of package names, builds manifest URLs, fires +//! parallel `GET` requests, records `(start, end)` per request, and +//! reports the same diag shape as ruborist's `Preload HTTP diag` line. +//! +//! Two input modes: +//! - `--names-file ` — newline-separated package names +//! - `--lockfile ` — a npm-style package-lock.json; we extract +//! the `packages.*` (v3) or `dependencies.*` (v2) keys +//! +//! Two registry modes: +//! - `/` — full manifest endpoint (default, npmjs) +//! - `//latest` — single-version endpoint +//! (gated behind `--single-version`) +//! +//! Each request reads the body to completion (we only measure I/O, no +//! parse). Output: same fields as preload's HTTP diag for direct +//! comparison. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; + +#[derive(Parser, Debug)] +#[command( + name = "manifest-bench", + about = "HTTP-only manifest fetch bench (no parse, no resolver)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// File of newline-separated package names. Mutually exclusive with `--lockfile`. 
+ #[arg(long, conflicts_with = "lockfile")] + names_file: Option, + + /// `package-lock.json` file. Reads top-level `packages.*.name` keys. + #[arg(long)] + lockfile: Option, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 128)] + concurrency: usize, + + /// Number of times to repeat the whole sweep (each iteration is a + /// fresh `reqwest::Client`, so connection pool / TLS handshake + /// costs are paid each time, matching `hyperfine` cold-start). + #[arg(long, default_value_t = 1)] + reps: usize, + + /// Use the single-version endpoint `//latest` instead of the + /// full-manifest endpoint `/`. Smaller bodies, more requests + /// served per byte. + #[arg(long)] + single_version: bool, + + /// Override `Accept` header. Default mimics ruborist's preload + /// (`application/vnd.npm.install-v1+json` — abbreviated metadata). + #[arg(long)] + accept: Option, + + /// Override `User-Agent`. Default uses reqwest's default. Try + /// `Bun/1.x.x` to test whether Cloudflare differentiates by UA. + #[arg(long)] + user_agent: Option, + + /// Force HTTP/1.1 (no H2 negotiation). Default lets ALPN decide. 
+ #[arg(long)] + http1_only: bool, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let names = load_names(&args)?; + if names.is_empty() { + return Err(anyhow!("no package names found in input")); + } + + println!( + "manifest-bench: registry={} concurrency={} reps={} names={} h1_only={} single_version={} accept={} ua={}", + args.registry, + args.concurrency, + args.reps, + names.len(), + args.http1_only, + args.single_version, + args.accept.as_deref().unwrap_or(""), + args.user_agent.as_deref().unwrap_or(""), + ); + + for rep in 1..=args.reps { + run_once(&args, &names, rep).await?; + } + + Ok(()) +} + +fn load_names(args: &Args) -> Result> { + if let Some(path) = &args.names_file { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return Ok(raw + .lines() + .map(str::trim) + .filter(|s| !s.is_empty() && !s.starts_with('#')) + .map(str::to_string) + .collect()); + } + + if let Some(path) = &args.lockfile { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return extract_lockfile_names(&raw); + } + + Err(anyhow!("provide --names-file or --lockfile")) +} + +/// Pull unique package names from an npm v3 lockfile (`packages.*`) +/// or an older v2 lockfile (`dependencies.*`). +fn extract_lockfile_names(raw: &str) -> Result> { + use std::collections::BTreeSet; + + let v: serde_json::Value = serde_json::from_str(raw).context("parse lockfile JSON")?; + let mut names: BTreeSet = BTreeSet::new(); + + if let Some(packages) = v.get("packages").and_then(|p| p.as_object()) { + for key in packages.keys() { + if key.is_empty() { + continue; + } + // npm v3 packages key like "node_modules/foo" or + // "node_modules/@scope/bar/node_modules/baz" — take the + // last path segment (or @scope/name pair). 
+ let last = last_module_name(key); + if !last.is_empty() { + names.insert(last); + } + } + } else if let Some(deps) = v.get("dependencies").and_then(|d| d.as_object()) { + for key in deps.keys() { + names.insert(key.clone()); + } + } + + Ok(names.into_iter().collect()) +} + +fn last_module_name(key: &str) -> String { + let parts: Vec<&str> = key.split("node_modules/").collect(); + let tail = parts.last().copied().unwrap_or(""); + tail.to_string() +} + +#[derive(Debug)] +struct ReqResult { + start: Instant, + end: Instant, + bytes: usize, + status: u16, +} + +async fn run_once(args: &Args, names: &[String], rep: usize) -> Result<()> { + // Build a fresh client per rep — matches hyperfine's cold-start + // assumption that each iteration pays the TLS handshake cost. + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let accept = Arc::new( + args.accept + .clone() + .unwrap_or_else(|| "application/vnd.npm.install-v1+json".to_string()), + ); + + let single_version = args.single_version; + let concurrency = args.concurrency; + + let phase_start = Instant::now(); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; + let mut results: Vec = Vec::with_capacity(names.len()); + + while idx < names.len() && futs.len() < concurrency { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + + while let Some(res) = futs.next().await { + results.push(res); + if idx < names.len() { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + } + let phase_wall_ms = phase_start.elapsed().as_millis(); + + report(rep, &results, phase_wall_ms); + Ok(()) +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_one( + client: &reqwest::Client, + registry: &Arc, + name: &str, + accept: &Arc, + single_version: bool, + futs: &mut FuturesUnordered, +) { + let url = if single_version { + 
format!("{registry}/{name}/latest") + } else { + format!("{registry}/{name}") + }; + let client = client.clone(); + let accept = Arc::clone(accept); + futs.push(Box::pin(async move { + let start = Instant::now(); + let req = client.get(&url).header("accept", accept.as_str()).send(); + let (bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.len()).unwrap_or(0); + (body, status) + } + Err(_) => (0, 0), + }; + let end = Instant::now(); + ReqResult { + start, + end, + bytes, + status, + } + })); +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Matches ruborist's `service::http` setup. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? 
+ .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} + +fn report(rep: usize, results: &[ReqResult], wall_ms: u128) { + if results.is_empty() { + eprintln!("[rep {rep}] no results"); + return; + } + + let mut spans: Vec<(Instant, Instant)> = results.iter().map(|r| (r.start, r.end)).collect(); + spans.sort_by_key(|(s, _)| *s); + + let first_start = spans.first().unwrap().0; + let last_end = spans.iter().map(|(_, e)| *e).max().unwrap(); + let win_wall = last_end.duration_since(first_start).as_millis(); + + let mut per_us: Vec = spans + .iter() + .map(|(s, e)| e.duration_since(*s).as_micros()) + .collect(); + per_us.sort_unstable(); + let n = per_us.len(); + let pct = |p: usize| per_us[(n * p).div_ceil(100).saturating_sub(1)]; + let sum: u128 = per_us.iter().sum(); + let p50 = per_us[n / 2]; + + let mut busy_us: u128 = 0; + let (mut cur_s, mut cur_e) = spans[0]; + for &(s, e) in &spans[1..] 
{ + if s <= cur_e { + if e > cur_e { + cur_e = e; + } + } else { + busy_us += cur_e.duration_since(cur_s).as_micros(); + cur_s = s; + cur_e = e; + } + } + busy_us += cur_e.duration_since(cur_s).as_micros(); + + let bytes_total: usize = results.iter().map(|r| r.bytes).sum(); + let ok = results.iter().filter(|r| r.status == 200).count(); + let err = results.iter().filter(|r| r.status == 0).count(); + let four_xx = results + .iter() + .filter(|r| (400..500).contains(&r.status)) + .count(); + let five_xx = results + .iter() + .filter(|r| (500..600).contains(&r.status)) + .count(); + + let avg_conc = if busy_us > 0 { + sum as f64 / busy_us as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] n={} phase_wall={}ms win_wall={}ms busy={}ms ({:.0}%) sum={}ms avg_conc={:.1} p50={}ms p95={}ms p99={}ms max={}ms bytes={} 200={} 4xx={} 5xx={} err={}", + n, + wall_ms, + win_wall, + busy_us / 1000, + if win_wall > 0 { + 100.0 * (busy_us as f64 / 1000.0) / win_wall as f64 + } else { + 0.0 + }, + sum / 1000, + avg_conc, + p50 / 1000, + pct(95) / 1000, + pct(99) / 1000, + per_us.last().unwrap() / 1000, + bytes_total, + ok, + four_xx, + five_xx, + err, + ); +} diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index bc281fb40..a0235830a 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,15 +132,20 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration. +// Manifest fetch concurrency configuration. Default kept at 64. // -// 256 to match bun's observed ~260 parallel TCP streams against -// registry.npmjs.org. Local fetch-breakdown instrumentation showed -// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), -// only 11% body, 0.16% parse — so the dominant lever for p1 wall is -// the cap on concurrent in-flight manifest requests. 
+// We tried 256 to match bun's observed parallel streams; on GHA the +// fetch-breakdown instrumentation showed sum_parse exploded from +// ~10ms (local Mac, network-bound) to 728s on first cold run with +// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to +// rayon, which has only num_cpus (=2 on GHA) workers. Bumping +// concurrency to 256 queued 256 parses behind 2 workers → wall +// per-parse jumped from 730µs to 266ms. Net p1 wall *increased* +// 3.10s → 3.33s on phases bench. Keep 64 until we address the +// parse-side queueing (e.g. inline parse on tokio, or a wider +// dedicated parse pool). static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 5690a9b6b416fb7040a52a3ce24a303177d8bc76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:56:20 +0800 Subject: [PATCH 03/24] ci(p1): wire manifest-bench standalone HTTP sweep into bench-phases-linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build-linux now also builds + uploads `manifest-bench` when a phases bench is going to run (label or dispatch). bench-phases-linux downloads the binary and runs it after the regular phase-isolated benchmark. Sweep mirrors the original (#2818-era) wire-in: concurrency: 32 / 64 / 96 / 128 / 192 / 256 (HTTP/1.1, full manifest) protocol: H1 vs H2-negotiate (cap=128) endpoint: full vs `//latest` (cap=128, smaller bodies) UA: default vs `Bun/1.2.21` (cap=128) Output goes to /tmp/pm-bench-output/manifest-bench-npmjs.log and ships in the existing pm-bench-logs-linux artifact — no PR comment surface (the headline phases bench comment stays the same). 
Why now: the new ruborist `p1-breakdown` instrumentation showed sum_parse on GHA can dominate when concurrency is bumped (256: sum_parse 728s vs sum_request 193s). To attribute the bun-vs-utoo gap on p1_resolve we need a "pure HTTP" baseline that strips out ruborist's parse / BFS / dedup / lockfile path. manifest-bench is that baseline: same TLS stack as ruborist (rustls + aws-lc-rs, native roots), no resolver pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 74c90ece5..b25f5c380 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -143,6 +143,24 @@ jobs: name: utoo-linux-x64 path: target/x86_64-unknown-linux-gnu/release/utoo retention-days: 1 + # manifest-bench is a standalone HTTP-only fetch sweeper used as + # the network-only baseline for p1_resolve perf work. Built only + # when phases bench is going to run (label or dispatch), so plain + # PR builds aren't slowed by the extra crate. 
+ - name: Build manifest-bench (p1 baseline) + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p manifest-bench + - name: Upload manifest-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/manifest-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -516,6 +534,19 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # Download the manifest-bench binary built by build-linux. Used as + # the network-only baseline for p1_resolve work — strips out parse, + # BFS, dedup, lockfile write so the wall is pure HTTP fetch. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -565,6 +596,55 @@ jobs: run: | mkdir -p /tmp/pm-bench-output bash bench/pm-bench-phases.sh 2>&1 | tee /tmp/pm-bench-output/bench-phases-npmmirror.log + # Standalone HTTP-only sweep — sweeps the network-only ceiling + # against the same lockfile-derived workload phase-bench just used. + # Output goes into the bench logs artifact; no PR comment surface. + - name: Standalone manifest-bench (HTTP-only sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + mkdir -p /tmp/pm-bench + git clone --depth 1 "https://github.com/ant-design/$PROJECT" "$PROJECT_DIR" + fi + cd "$PROJECT_DIR" + if [ ! -f package-lock.json ]; then + echo "==> generating lockfile via utoo (one-shot, untimed)" + utoo deps --registry "$REGISTRY" || true + fi + ls -la package-lock.json || { echo "no lockfile; skipping manifest-bench"; exit 0; } + + MB_LOG=/tmp/pm-bench-output/manifest-bench-npmjs.log + { + echo "============================================================" + echo "manifest-bench: HTTP-only fetch (no parse, no resolver)" + echo " Goal: isolate reqwest/rustls/tokio behaviour from" + echo " ruborist's resolver pipeline. Same metric shape as" + echo " ruborist's p1-breakdown line." 
+ echo "============================================================" + for CAP in 32 64 96 128 192 256; do + echo + echo "--- concurrency=$CAP, h1, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 2 --http1-only || true + done + echo + echo "--- concurrency=128, h2 negotiate, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 || true + echo + echo "--- concurrency=128, h1, single-version endpoint ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --single-version || true + echo + echo "--- concurrency=128, h1, UA=Bun/1.2.21 ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true + } 2>&1 | tee "$MB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 From 94af458887de3add09f2e973dbbad6f2524f1a5f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:24:56 +0800 Subject: [PATCH 04/24] perf(ruborist): inline JSON parse, drop rayon::spawn dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI fetch-breakdown on GHA (run 25562552058, conc=64) showed parse queueing on rayon dominates the gap to manifest-bench's pure-HTTP baseline: manifest-bench (pure HTTP, conc=64): 2.12s wall utoo p1 (full ruborist): 3.10s wall ← +1.0s overhead ↑ sum_parse 95s vs sum_request 95s, parse 50% of work-time ↑ avg_parse 30ms wall vs ~5ms actual CPU — the 25ms extra is rayon queue wait Mechanism: 64 concurrent tasks all dispatching parse to rayon's pool (size = num_cpus = 2 on GHA). Queue depth grows to ~32 per worker. Each parse waits 25ms+ in queue before running its 5ms of CPU work. Round 1 fix: inline parse, drop the rayon hop. 
simd_json on a tokio worker thread is fast (~5ms for 115KB JSON), and the tokio runtime's cooperative budget naturally rebalances CPU across the 64 tasks. Expected on next CI: - avg_parse drops from 30ms wall → ~5-10ms wall (close to CPU-only) - preload_wall drops from 5.4s → ~3.5-4s for cold runs - p1 hyperfine wall drops from 3.10s → 2.3-2.5s, narrowing the gap to manifest-bench's 2.12s ceiling If parse becomes the new bottleneck (CPU-bound), next round could look at partial parse / lazy field access. If wall doesn't drop, hypothesis is wrong and we look elsewhere (BFS, dedup, lockfile). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 29 +++++++++---------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 36bc6a85a..3502f6ec2 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,29 +14,20 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes on rayon's CPU thread pool (native) or inline -/// (wasm32). Keeps the tokio runtime free of `simd_json` work so other -/// in-flight manifest fetches keep driving network IO while this one -/// parses. +/// Parse JSON bytes inline on the calling tokio task. Previously this +/// dispatched to `rayon::spawn` to "free the runtime", but +/// fetch-breakdown instrumentation on GHA showed the rayon hop made it +/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), +/// 64 concurrent fetches all dispatching parse queued behind 2 workers +/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + +/// CPU). Inlining puts parse on the tokio worker that already owns +/// the buffer; the cooperative-scheduling budget naturally rebalances +/// CPU between fetches. 
async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - #[cfg(not(target_arch = "wasm32"))] - { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { - let result = simd_json::serde::from_slice::(&mut bytes) - .map_err(|e| anyhow!("JSON parse error: {e}")); - let _ = tx.send(result); - }); - rx.await - .map_err(|e| anyhow!("rayon parse channel closed: {e}"))? - } - #[cfg(target_arch = "wasm32")] - { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) - } + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) } /// Result of a full manifest fetch with ETag support. From ee5f5f4d23c8c9668c90c7d6b3b12eb49dab3afe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:47:49 +0800 Subject: [PATCH 05/24] perf(ruborist): switch JSON parse to tokio spawn_blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 1 (inline parse) reverted on data: GHA showed +0.37s p1 regression because parse blocked tokio runtime workers, dropping eff_parallel 42 → 35 even though per-fetch work-time fell. avg_request went up from 35ms → 52ms — symptomatic of socket reads being delayed by the parsing task on the same worker. metric round 0 (rayon) round 1 (inline) p1 wall 3.27s 3.64s ⚠️ +0.37s avg_parse 30ms (queued) 300µs ✓ avg_request 35ms 52ms ⚠️ +17ms (worker contention) eff_parallel 42 35 ⚠️ Round 2 attempts the third option: `tokio::task::spawn_blocking`. - rayon's pool was too small (num_cpus = 2 on GHA) — 64 concurrent parses queued behind 2 workers, parse wall 30ms. - inline parse held tokio worker hostage during simd_json call, starving in-flight socket reads. - tokio's blocking pool has a much larger default cap (512), so 64 concurrent parses never queue. Unlike rayon there's no contention with the install path's parallel-write rayon usage. 
Unlike inline the tokio runtime workers stay free to drive network I/O. Expected on next CI: - avg_parse drops to ~5-10ms wall (close to CPU floor, no queue) - avg_request stays ~35ms (workers free for I/O) - eff_parallel returns to ~50, possibly higher - p1 wall drops toward manifest-bench's 2.10s ceiling Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 3502f6ec2..90f1db71b 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,20 +14,39 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes inline on the calling tokio task. Previously this -/// dispatched to `rayon::spawn` to "free the runtime", but -/// fetch-breakdown instrumentation on GHA showed the rayon hop made it -/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), -/// 64 concurrent fetches all dispatching parse queued behind 2 workers -/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + -/// CPU). Inlining puts parse on the tokio worker that already owns -/// the buffer; the cooperative-scheduling budget naturally rebalances -/// CPU between fetches. +/// Parse JSON bytes on tokio's blocking thread pool. +/// +/// The history of this function captures three different attempts: +/// - rayon::spawn (original): rayon's pool is `num_cpus` (= 2 on +/// GHA), 64 concurrent parses queued behind 2 workers → avg_parse +/// 30ms wall vs ~5ms CPU. round-0 baseline. 
+/// - inline (round 1, reverted): no rayon hop, but the simd_json +/// call blocks the tokio runtime worker, so other in-flight +/// fetches couldn't drive their socket I/O — avg_request grew +/// 35ms → 52ms (+17ms), eff_parallel 42 → 35, net p1 wall +0.37s. +/// - spawn_blocking (current): tokio's dedicated blocking pool has +/// a much higher default cap (512), so 64 concurrent parses are +/// never queued. Unlike rayon there's no contention with the +/// install path's parallel-write rayon usage, and unlike inline +/// the tokio runtime workers stay free to drive network I/O on +/// all in-flight fetches. async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + simd_json::serde::from_slice::(&mut bytes) + .map_err(|e| anyhow!("JSON parse error: {e}")) + }) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? + } + #[cfg(target_arch = "wasm32")] + { + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + } } /// Result of a full manifest fetch with ETag support. From 16404fc481577a03b00ba2f46aa1f3711ec5351f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:14:46 +0800 Subject: [PATCH 06/24] perf(ruborist): switch extract_core_version to spawn_blocking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 moved parse_json_off_runtime off rayon (-0.11s p1). But fetch-breakdown still showed avg_request 41ms vs round 0's 35ms, hinting at a second source of rayon contention. Found it: `extract_core_version_off_runtime` is also on `rayon::spawn`. 
On npmjs.org's `!supports_semver` path EVERY fetch resolves through `resolve_via_full_manifest`, which fetches the full packument once per package name (deduped via inflight_full) and then calls `extract_core_version_off_runtime` per (name, spec) to materialize the chosen version into a `CoreVersionManifest`. So per fetch we hit rayon TWICE — once for the JSON parse (round 2 moved to spawn_blocking), and once for `get_core_version` (still on rayon). The second hop has the same head-of-line blocking signature as the first: 64 concurrent resolves dispatching to a 2-thread rayon pool. Round 3: move extract_core_version_off_runtime to spawn_blocking for the same reasons. The work is JSON lazy-reparse (`raw_json` sub-tree decoding) — genuinely blocking, well-suited for tokio's blocking pool. Expected: utoo p1 wall drops further toward manifest-bench's 2.10s ceiling. avg_request should fall back from 41ms → ~35ms (rayon contention removed from the fetch task's await chain). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 37e95deb9..15c762eb5 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,14 +163,20 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { + // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking + // history: rayon's `num_cpus` pool oversubscribes when many concurrent + // resolves all extract from full manifests at once. spawn_blocking's + // larger pool avoids the queue, and the work is genuinely blocking + // (lazy JSON re-parse via `get_core_version`) so the blocking pool + // is the right home. 
#[cfg(not(target_arch = "wasm32"))] { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { + tokio::task::spawn_blocking(move || { let core = full.get_core_version(&version).map(Arc::new); - let _ = tx.send((version, core)); - }); - rx.await.expect("rayon parse worker dropped before sending") + (version, core) + }) + .await + .expect("spawn_blocking parse worker panicked") } #[cfg(target_arch = "wasm32")] { From 460a53885b30982bd19c68ca1a866fa540c66a76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:37:37 +0800 Subject: [PATCH 07/24] revert + instrument(ruborist): post-build phase timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes for round 4 of p1 optimization: 1. Revert `extract_core_version_off_runtime` from spawn_blocking back to rayon::spawn (round 3). Within-run measurement showed +0.42s regression vs utoo-next (round 2 was +0.11s). Likely cause: this function is called per (name, spec), so multi-spec packages call it 2-5x per fetch. spawn_blocking's per-dispatch overhead exceeds rayon queue savings at this multiplier. 2. Add `serialize_us` and `cache_export_us` to the p1-breakdown line so we can attribute the remaining gap. Currently: manifest-bench wall: 2.10s (pure HTTP ceiling) utoo p1 wall (round 2): 3.16s gap: 1.06s We have: preload_wall ≈ 2.7s (logged) bfs_wall ≈ 0.3s (logged) serialize_us ? cache_export_us ? ← suspected: full manifest deep-clone into ProjectCacheData for ~2730 entries Next round will have data to choose between attacking serialize, cache export, or the BFS loop body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 24 +++++++++++++----------- crates/ruborist/src/service/api.rs | 10 ++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 15c762eb5..3509e839d 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,20 +163,22 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { - // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking - // history: rayon's `num_cpus` pool oversubscribes when many concurrent - // resolves all extract from full manifests at once. spawn_blocking's - // larger pool avoids the queue, and the work is genuinely blocking - // (lazy JSON re-parse via `get_core_version`) so the blocking pool - // is the right home. + // Round 3 attempted to switch this to `tokio::task::spawn_blocking` + // for the same reasons as `parse_json_off_runtime`, but CI showed + // it regressed p1 by 0.5s on `preload_wall`. Mechanism: this + // function is called per (name, spec), so packages with multiple + // specs (e.g. peer-dep range overlaps) call it 2-5x per fetch. + // spawn_blocking's per-dispatch overhead (channel + thread wake) + // is significant for short CPU work; with the multiplier this + // outweighed rayon queue waits at conc=64. Keep on rayon::spawn. 
#[cfg(not(target_arch = "wasm32"))] { - tokio::task::spawn_blocking(move || { + let (tx, rx) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { let core = full.get_core_version(&version).map(Arc::new); - (version, core) - }) - .await - .expect("spawn_blocking parse worker panicked") + let _ = tx.send((version, core)); + }); + rx.await.expect("rayon parse worker dropped before sending") } #[cfg(target_arch = "wasm32")] { diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 878b357a1..82703ed97 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -258,9 +258,12 @@ where .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + let t_serialize_start = std::time::Instant::now(); let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; // Export project cache from memory cache for the host to persist. + let t_cache_export_start = std::time::Instant::now(); let mut project_cache = ProjectCacheData::default(); for (key, manifest) in registry.cache().export_version_manifests() { // `parse_package_spec` rather than `split_once('@')` so scoped names @@ -271,6 +274,13 @@ where pkg_cache.specs.insert(spec.to_string(), version.clone()); pkg_cache.manifests.insert(version, (*manifest).clone()); } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); Ok(BuildDepsOutput { lock: PackageLock::new(&pkg.name, &pkg.version, packages), From 58d49aafd2f886d1af364d91f85997e4dc01e37e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:02:11 +0800 Subject: [PATCH 08/24] instrument(ruborist): preload main loop dispatch + result split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 4 measured serialize_us = 
15ms and cache_export_us = 34ms — both tiny — confirming the 1s gap from manifest-bench (utoo p1 = 3.16s vs mb wall = 2.10s) is not in post-build code. Per-fetch math also pointed at main-loop bookkeeping: manifest-bench: eff_parallel = 52 (sum_work 111s / wall 2.14s) utoo preload : eff_parallel = 43 (sum_work 120s / wall 2.85s) Same conc=64 cap, but utoo loses 9 effective slots — most likely the main loop's serial bookkeeping (dedup hash insert, format! key, extract_transitive_deps, queue push, 3-4 receiver events) holds the flow between futures.next() returning and the next fetch dispatch. This commit splits the main loop into two timed segments: preload_loop_dispatch_us: time spent in the `while in_flight < concurrency` block — popping pending, dedup check, futures.push. preload_loop_result_us: time spent processing each completed future — extract_transitive_deps, pending.extend, on_manifest. If dispatch+result sum approaches preload_wall, the main loop is the bottleneck and we need to either (a) split processing onto a dedicated task, or (b) use unbounded futures with a downstream consumer. If they're small, the gap is elsewhere (per-task overhead in resolve_package's inflight gates). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/preload.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/ruborist/src/resolver/preload.rs b/crates/ruborist/src/resolver/preload.rs index 1230c5bf6..e9a777407 100644 --- a/crates/ruborist/src/resolver/preload.rs +++ b/crates/ruborist/src/resolver/preload.rs @@ -99,8 +99,17 @@ where let mut in_flight = 0usize; let mut started = false; + // Main-loop overhead instrumentation. Atomic accumulators so we + // can attribute the gap between manifest-bench's pure-HTTP wall + // and ruborist's preload wall: how much of the gap is bookkeeping + // (dedup hash, extract_transitive_deps, queue push, events) vs + // actual fetch wait? 
+ let mut total_dispatch_us: u64 = 0; + let mut total_result_us: u64 = 0; + loop { // Fill up to concurrency limit + let dispatch_start = tokio::time::Instant::now(); while in_flight < concurrency { let item = loop { let Some((name, spec)) = pending.pop_front() else { @@ -134,6 +143,7 @@ where }); in_flight += 1; } + total_dispatch_us += dispatch_start.elapsed().as_micros() as u64; if in_flight == 0 { break; @@ -142,6 +152,7 @@ where let Some((name, result, elapsed_ms)) = futures.next().await else { break; }; + let result_start = tokio::time::Instant::now(); in_flight -= 1; if stats.success_count == 0 && stats.failed_count == 0 { @@ -174,8 +185,15 @@ where tracing::debug!("Failed to preload {}: {}", name, e); } } + total_result_us += result_start.elapsed().as_micros() as u64; } + tracing::info!( + "p1-breakdown preload_loop_dispatch_us={} preload_loop_result_us={}", + total_dispatch_us, + total_result_us, + ); + stats.total_processed = processed.len(); receiver.on_event(BuildEvent::PreloadComplete { From 8114bf42af0e9d102bd9c2893acd764d9e0470be Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:25:54 +0800 Subject: [PATCH 09/24] perf(pm): grow rayon pool to max(num_cpus, 8) to drain p1 extract queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 main-loop instrumentation showed the preload main loop itself is fast (15-25ms total dispatch+result). The 0.8s gap from manifest-bench's 2.10s wall lives INSIDE the spawned fetch tasks. Per-fetch wall (warm runs): measured: avg_request 30ms + avg_body 6ms + avg_parse 2.5ms = ~38ms derived: preload_wall 2.4s × eff_parallel(43) / 2730 = 38ms delta: ~12ms unaccounted per task That 12ms is `extract_core_version_off_runtime` queueing on rayon's 2-thread pool. extract is called per (name, spec) — for ant-design that's ~3000+ calls. 
With pool=2 and 64 concurrent fetches each dispatching extract, the queue depth grows; each task waits its turn before extract returns. Bump rayon pool to `max(num_cpus, 8)` for non-Windows. Sizing the pool above the CPU count for short blocking JSON ops (parse + extract) replaces FIFO queueing with parallel dispatch. Real CPU contention is bounded by num_cpus (the kernel scheduler still gates), so the extra pool threads just hold ready-to-run dispatches in parallel rather than serialised in a queue. Why not just spawn_blocking (round 3 attempt): tokio's blocking pool defaults to 512 threads, but its per-dispatch overhead was higher than rayon's even when queueing — round 3 regressed by 0.5s. Expected: extract queue wait drops from ~12ms to ~1-2ms wall, p1 preload_wall narrows toward manifest-bench's 2.10s. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/sysconf.rs | 45 ++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/crates/pm/src/util/sysconf.rs b/crates/pm/src/util/sysconf.rs index af77a7745..645b7b451 100644 --- a/crates/pm/src/util/sysconf.rs +++ b/crates/pm/src/util/sysconf.rs @@ -6,13 +6,46 @@ pub fn init() { reset_sigpipe(); } - // Windows default thread stack is 1MB, insufficient for libdeflater + tar - // + rayon work-stealing. + init_rayon_pool(); +} + +/// Configure the global rayon pool size. +/// +/// Rayon defaults to `num_cpus` workers, which is 2 on GHA ubuntu-latest. +/// Two workers are enough for the install-path's `par_chunks(64)` extract +/// (mostly disk-bound), but the resolve-path's manifest parse + extract +/// pipeline runs *many* short CPU bursts (parse: ~5ms, get_core_version: +/// ~1-3ms) dispatched from up to 64 concurrent fetches. +/// +/// With pool=2, each fetch waits up to ~25ms in queue per dispatch — +/// fetch-breakdown instrumentation showed avg_parse jumping 5ms (CPU) +/// → 30ms (CPU + queue) just from the first dispatch. 
The second hop +/// (`extract_core_version_off_runtime`) has the same problem. `tokio +/// spawn_blocking` avoids the queue but its per-dispatch overhead +/// (round 3 measurement) was higher than rayon's queue wait at 64×. +/// +/// Sizing the pool above the host CPU count for these short, blocking +/// JSON-shape operations gives the queue a chance to drain even when +/// 64 fetches dispatch concurrently. The work itself is bounded — at +/// most 2 are doing real CPU at once on a 2-core box; the extra pool +/// slots just hold pending tasks until a CPU is free, replacing FIFO +/// queueing with parallel dispatch. +/// +/// Cap of 8 keeps the pool reasonable on bigger machines (where +/// `num_cpus` is already enough); the floor of 8 oversubscribes +/// only on the constrained 2-core CI image. +fn init_rayon_pool() { + let parallelism = std::thread::available_parallelism() + .map(std::num::NonZero::get) + .unwrap_or(2); + let threads = parallelism.max(8); + + let builder = rayon::ThreadPoolBuilder::new().num_threads(threads); + #[cfg(target_os = "windows")] - rayon::ThreadPoolBuilder::new() - .stack_size(8 * 1024 * 1024) - .build_global() - .ok(); + let builder = builder.stack_size(8 * 1024 * 1024); + + builder.build_global().ok(); } /// Restore default SIGPIPE handling so broken pipes cause a clean exit From 394f6c92d7c5f929c18846abec54fefb9dbbb1bd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:06:09 +0800 Subject: [PATCH 10/24] perf(pm): skip preload for p1 path; BFS does per-level parallel prefetch Adds `BuildDepsOptions::skip_preload` so callers without a pipeline consumer (utoo deps / package-lock-only) can drop the up-front preload phase entirely. BFS now batches prefetch per level across the whole frontier, then runs the existing sequential process_dependency walk against the warmed cache. For install paths (Context::pipeline_deps_options), skip_preload stays false so PackageResolved events still feed the download/clone pipeline. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 8 ++- crates/ruborist/src/resolver/builder.rs | 71 +++++++++++++++++++++--- crates/ruborist/src/service/api.rs | 21 ++++++- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index b47def019..bc4d7faa1 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -63,6 +63,7 @@ impl Context { receiver, supports_semver: get_supports_semver(), catalogs, + skip_preload: false, } } @@ -82,8 +83,13 @@ impl Context { /// Resolve dependency tree with plain ProgressReceiver. Returns /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. + /// + /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes + /// `PackageResolved` events here, so preload is pure overhead — BFS's + /// own per-level parallel prefetch warms the manifest cache. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { - let options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + options.skip_preload = true; let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 166372c91..d811fc38c 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -18,21 +18,22 @@ //! This separation allows for maximum parallelism during network I/O //! while keeping the graph building logic simple and deterministic. 
-use petgraph::graph::NodeIndex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; +use futures::stream::{self, StreamExt}; +use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_registry_dep}; -use crate::spec::{Catalogs, PackageSpec, Protocol}; +use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,9 +182,6 @@ struct NodeFlags { /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { - use crate::spec::SpecStr; - use std::collections::HashSet; - let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,20 +803,74 @@ async fn run_preload_phase( } /// Run the BFS traversal phase to build the dependency tree. +/// +/// Each level does a parallel prefetch of all unresolved registry specs +/// before the sequential `process_dependency` walk. The prefetch warms +/// the registry's manifest cache so the per-edge `process_dependency` +/// calls below hit cache instead of awaiting network. +/// +/// This collapses the previously-separate `run_preload_phase` (which +/// fetched all transitive manifests up-front) into per-level batches. 
+/// Net effect on `utoo deps`: no separate preload wall — fetch happens +/// inside BFS in waves matching the dep tree's natural levels. For +/// install paths (p0/p3), `run_preload_phase` may still run via +/// `skip_preload=false` and feed the `PackageResolved` pipeline event. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { + // Reset fetch counters so the breakdown line reports fetches issued + // *during* this BFS phase, not preload's. (Preload still runs for + // install-path callers and reports its own breakdown.) + if config.skip_preload { + crate::util::FETCH_TIMINGS.reset(); + } + let start = tokio::time::Instant::now(); + let mut total_prefetch_wall_us: u64 = 0; + let mut total_merge_wall_us: u64 = 0; let mut current_level = vec![graph.root_index]; + let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); + + // Phase A: collect unresolved registry edges across the whole level + // (deduplicated against earlier levels — once a (name, spec) is + // prefetched, the registry's cache satisfies every subsequent + // `process_dependency` call). + let mut prefetch_targets: Vec<(String, String)> = Vec::new(); + for &node_index in ¤t_level { + for edge in collect_unresolved_edges(graph, node_index) { + if edge.spec.is_registry_spec() { + let key = format!("{}@{}", edge.name, edge.spec); + if prefetched.insert(key) { + prefetch_targets.push((edge.name, edge.spec)); + } + } + } + } + + // Phase B: parallel prefetch — pure cache warming. Errors are + // ignored here; the sequential `process_dependency` below will + // re-issue (now hitting either cache or the same fresh failure) + // and propagate any real error through the existing path. 
+ if !prefetch_targets.is_empty() { + let prefetch_start = tokio::time::Instant::now(); + stream::iter(prefetch_targets) + .for_each_concurrent(config.concurrency, |(name, spec)| async move { + let _ = resolve_package(registry, &name, &spec).await; + }) + .await; + total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; + } + + let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -900,14 +952,17 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); + total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", bfs_elapsed.as_millis(), + total_prefetch_wall_us, + total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 82703ed97..5a14f2a56 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -70,6 +70,16 @@ pub struct BuildDepsOptions { /// Catalog definitions for the `catalog:` dependency protocol. /// Key `""` = default catalog, other keys = named catalogs. pub catalogs: Catalogs, + /// When true, skip the up-front `run_preload_phase`. Set by callers + /// that don't consume the `BuildEvent::PackageResolved` pipeline + /// stream — e.g. `utoo deps` (lockfile-only). The BFS phase has its + /// own per-level prefetch that warms the manifest cache, so dropping + /// preload doesn't change correctness, only avoids the redundant + /// up-front fetch + dedicated wall. 
+ /// Install paths (which feed `PipelineReceiver` to start tarball + /// downloads as resolves complete) leave this false so preload still + /// emits PackageResolved events to the pipeline. + pub skip_preload: bool, } impl BuildDepsOptions { @@ -91,6 +101,7 @@ impl BuildDepsOptions { receiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, } } } @@ -132,6 +143,7 @@ where receiver, supports_semver, catalogs, + skip_preload: skip_preload_caller, } = options; // 1. Find root path (workspace root if applicable) @@ -234,7 +246,13 @@ where registry.supports_semver(), ); - let skip_preload = cache_count > 0; + // Skip preload when: + // - the caller asked us to (e.g. `utoo deps`, no pipeline consumer + // for PackageResolved events — BFS does its own per-level + // prefetch, preload is redundant), OR + // - the project's warm cache already has manifests covering most + // of the workload (existing skip-on-warm behavior). + let skip_preload = skip_preload_caller || cache_count > 0; let mut config = BuildDepsConfig::default() .with_peer_deps(peer_deps) .with_concurrency(concurrency) @@ -334,6 +352,7 @@ mod tests { receiver: NoopReceiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, }; assert_eq!(options.concurrency, 20); From 596cd2045fd6ef5031703343b52ccad2a67a907f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:18:21 +0800 Subject: [PATCH 11/24] perf(pm): fast_preload bypasses UnifiedRegistry for utoo deps path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::fast_preload, a manifest-bench-style flat FuturesUnordered over service::manifest::fetch_full_manifest. It warms MemoryCache (both full_manifests and version_manifests slots) synchronously after each fetch, so the BFS phase is pure cache-hit: no rayon hop on extract_core_version, no OnceMap gates, no DiskManifestStore writes, no PackageResolved events. 
Wired into service::api::build_deps: when the caller asks to skip preload (Context::build_deps for `utoo deps`) and there's no warm project cache, fast_preload runs ahead of build_deps_with_config. Install paths still go through preload_manifests so the pipeline keeps its early-start signal. Also reverts the per-level prefetch I added in 394f6c92 — with fast_preload pre-warming everything, BFS doesn't need its own prefetch wave. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 72 ++---- crates/ruborist/src/resolver/fast_preload.rs | 234 +++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 26 +++ 4 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 crates/ruborist/src/resolver/fast_preload.rs diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index d811fc38c..156622502 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -24,7 +24,6 @@ use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; -use futures::stream::{self, StreamExt}; use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; @@ -32,7 +31,7 @@ use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::resolver::registry::{ResolveError, resolve_registry_dep}; use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,7 +180,10 @@ struct NodeFlags { /// Only registry specs (e.g. `^4.17.0`) are collected. 
`catalog:` specs are /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. -fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { +pub(crate) fn gather_preload_deps( + graph: &DependencyGraph, + peer_deps: PeerDeps, +) -> Vec<(String, String)> { let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,72 +807,29 @@ async fn run_preload_phase( /// Run the BFS traversal phase to build the dependency tree. /// /// Each level does a parallel prefetch of all unresolved registry specs -/// before the sequential `process_dependency` walk. The prefetch warms -/// the registry's manifest cache so the per-edge `process_dependency` -/// calls below hit cache instead of awaiting network. +/// before the sequential `process_dependency` walk. /// -/// This collapses the previously-separate `run_preload_phase` (which -/// fetched all transitive manifests up-front) into per-level batches. -/// Net effect on `utoo deps`: no separate preload wall — fetch happens -/// inside BFS in waves matching the dep tree's natural levels. For -/// install paths (p0/p3), `run_preload_phase` may still run via -/// `skip_preload=false` and feed the `PackageResolved` pipeline event. +/// When `skip_preload=true` (lockfile-only path), the caller is +/// expected to have already populated `registry.cache()` via +/// [`super::fast_preload::fast_preload`], so this BFS sees only +/// cache hits. When `skip_preload=false` (install paths), the +/// receiver-driven [`super::preload::preload_manifests`] runs ahead +/// of this phase and feeds `BuildEvent::PackageResolved` to the +/// pipeline. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { - // Reset fetch counters so the breakdown line reports fetches issued - // *during* this BFS phase, not preload's. 
(Preload still runs for - // install-path callers and reports its own breakdown.) - if config.skip_preload { - crate::util::FETCH_TIMINGS.reset(); - } - let start = tokio::time::Instant::now(); - let mut total_prefetch_wall_us: u64 = 0; - let mut total_merge_wall_us: u64 = 0; - let mut current_level = vec![graph.root_index]; - let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); - // Phase A: collect unresolved registry edges across the whole level - // (deduplicated against earlier levels — once a (name, spec) is - // prefetched, the registry's cache satisfies every subsequent - // `process_dependency` call). - let mut prefetch_targets: Vec<(String, String)> = Vec::new(); - for &node_index in ¤t_level { - for edge in collect_unresolved_edges(graph, node_index) { - if edge.spec.is_registry_spec() { - let key = format!("{}@{}", edge.name, edge.spec); - if prefetched.insert(key) { - prefetch_targets.push((edge.name, edge.spec)); - } - } - } - } - - // Phase B: parallel prefetch — pure cache warming. Errors are - // ignored here; the sequential `process_dependency` below will - // re-issue (now hitting either cache or the same fresh failure) - // and propagate any real error through the existing path. 
- if !prefetch_targets.is_empty() { - let prefetch_start = tokio::time::Instant::now(); - stream::iter(prefetch_targets) - .for_each_concurrent(config.concurrency, |(name, spec)| async move { - let _ = resolve_package(registry, &name, &spec).await; - }) - .await; - total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; - } - - let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -952,17 +911,14 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); - total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", + "p1-breakdown bfs_wall={}ms | {}", bfs_elapsed.as_millis(), - total_prefetch_wall_us, - total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs new file mode 100644 index 000000000..975c18a81 --- /dev/null +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -0,0 +1,234 @@ +//! Lean parallel manifest fetcher modeled on `manifest-bench`. +//! +//! Bypasses [`crate::service::registry::UnifiedRegistry`] — and therefore +//! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, +//! and `EventReceiver` event dispatch — to drive a flat +//! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] +//! plus a synchronous transitive walk. The warm +//! [`crate::service::cache::MemoryCache`] it leaves behind makes the +//! subsequent BFS phase a pure cache-hit walk: no network, no rayon +//! re-parse hop on `extract_core_version`. +//! +//! Intended for the lockfile-only path (`utoo deps`) which has no +//! 
pipeline consumer for `BuildEvent::PackageResolved` — install paths +//! still go through [`super::preload::preload_manifests`] so the +//! pipeline keeps its early-start signal. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::CoreVersionManifest; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +/// Statistics from the lean fetch loop. Mirrors `PreloadStats` shape so +/// the bench-grep regex stays the same. +#[derive(Debug, Default)] +pub struct FastPreloadStats { + pub success_count: usize, + pub failed_count: usize, + pub fetched_names: usize, + pub min_request_ms: u64, + pub max_request_ms: u64, + pub total_request_ms: u64, +} + +/// Collect dependencies from any deps map, filtering out non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +/// Extract transitive dependencies from a resolved manifest. +/// devDependencies are omitted (only the root installs devDeps). +fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut deps = Vec::new(); + deps.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + deps.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + deps.extend(collect_deps(manifest.optional_dependencies.as_ref())); + deps +} + +/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. 
+/// +/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does +/// after a cache hit — pick a version, parse just that subset, populate +/// the per-version cache slot the BFS phase will read from. Skips the +/// rayon/`spawn_blocking` hop because the caller is already doing +/// CPU-bound bookkeeping between fetches. +fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { + let Some(full) = cache.get_full_manifest(name) else { + return Vec::new(); + }; + let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { + return extract_transitive_deps(&cached, peer_deps); + } + let Some(core) = full.get_core_version(&resolved_version) else { + return Vec::new(); + }; + let core_arc = Arc::new(core); + cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch of all transitively-reachable +/// registry manifests. Populates `cache` with both `full_manifests` and +/// `version_manifests` slots so the subsequent BFS does no network and no +/// re-parse. +/// +/// `initial_deps` should already be the union of root+workspace +/// registry edges, with non-registry specs filtered out. +pub async fn fast_preload( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> FastPreloadStats { + let mut stats = FastPreloadStats::default(); + let mut pending: VecDeque = VecDeque::from(initial_deps); + // Specs we've already enqueued (or settled). Prevents duplicate + // sync resolutions from re-walking the same transitive subtree. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + // Names whose full manifest is either cached or in flight. 
Spec-level + // dedup happens in `seen_specs` above; this set is the gate that + // prevents two concurrent fetches for the same package (sibling + // specs queue against the in-flight one rather than racing). + let mut fetched_names: HashSet = HashSet::new(); + // Specs that arrived while their package's full manifest was still + // in flight — we'll settle them once the fetch lands. + let mut deferred_specs: Vec<(String, String)> = Vec::new(); + let mut futs = FuturesUnordered::new(); + let concurrency = config.concurrency; + let peer_deps = config.peer_deps; + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen_specs.insert((name.clone(), spec.clone())) { + continue; + } + + // Full manifest already cached: skip the network round-trip, + // settle synchronously and queue this package's transitive + // deps. This is the hot path on the second-and-later spec + // for any popular package (lodash, semver, etc.). + if cache.get_full_manifest(&name).is_some() { + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + continue; + } + + // Fetch in flight for this name — defer settling this spec + // until the fetch lands. The deferred set is small (only + // sibling specs for in-flight names) so the linear scan is + // cheaper than another HashMap. 
+ if !fetched_names.insert(name.clone()) { + deferred_specs.push((name, spec)); + continue; + } + + let registry_url = registry_url.to_string(); + let n = name.clone(); + futs.push(async move { + let start = tokio::time::Instant::now(); + let result = fetch_full_manifest(FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }) + .await; + let elapsed_ms = start.elapsed().as_millis() as u64; + (name, spec, result, elapsed_ms) + }); + } + + if futs.is_empty() { + break; + } + + let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + break; + }; + + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; + + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + cache.set_full_manifest(name.clone(), Arc::new(manifest)); + + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + + // Drain any sibling specs that arrived while this fetch + // was in flight. `extract_if`-style retain in place. + let mut i = 0; + while i < deferred_specs.len() { + if deferred_specs[i].0 == name { + let (n, s) = deferred_specs.swap_remove(i); + let new_deps = settle_spec(&n, &s, cache, peer_deps); + pending.extend(new_deps); + } else { + i += 1; + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is unreachable + // here in practice; treat it as a soft-failure to keep the + // path total. 
+ stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); + } + } + } + + let total = stats.success_count + stats.failed_count; + let avg_ms = if total > 0 { + stats.total_request_ms / total as u64 + } else { + 0 + }; + tracing::info!( + "p1-breakdown fast_preload n={} ok={} fail={} avg_req={}ms min={}ms max={}ms | {}", + total, + stats.success_count, + stats.failed_count, + avg_ms, + stats.min_request_ms, + stats.max_request_ms, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index 582e03b31..e7baad988 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -3,6 +3,7 @@ pub mod builder; pub mod common; pub mod edges; +pub mod fast_preload; #[cfg(feature = "native-git")] pub mod git; #[cfg(feature = "http-tarball")] diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 5a14f2a56..3b9b713ea 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,7 +36,10 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, + gather_preload_deps, }; +use crate::resolver::fast_preload::fast_preload; +use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; use crate::spec::Catalogs; @@ -269,6 +272,29 @@ where ); } + // Lockfile-only callers (`utoo deps`) skip the receiver-driven + // `run_preload_phase` because they have no pipeline consumer for + // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat + // `FuturesUnordered` over `fetch_full_manifest` that warms the + // `MemoryCache` so the BFS phase below is pure cache-hit. 
This is + // the manifest-bench-style path; the heavier `preload_manifests` + // path (with `OnceMap` gates + `EventReceiver` events) only runs + // for install paths that need the pipeline signal. + if skip_preload_caller && cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + fast_preload( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. From 2e74bba904e391931a71960464932334e0d46e94 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:51:27 +0800 Subject: [PATCH 12/24] perf(pm): dispatch fast_preload settle to rayon to free tokio runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 of fast_preload called settle_spec inline on the tokio worker — each settle ran simd_json::to_borrowed_value over the full manifest's raw bytes (5–10ms per spec) right on the runtime thread. CI showed it starved sibling fetches: avg_request rose +3ms, avg_parse jumped 5→11ms, p1_resolve regressed +1.0s vs the preload+BFS baseline (4.0s vs 3.0s). Fix: route every settle through extract_core_version_off_runtime (the same rayon::spawn helper the BFS path uses), and merge fetch and settle completions into a single FuturesUnordered so backpressure on either side throttles the other. Sibling specs that arrived during a fetch are now stashed by name (HashMap, not linear scan), then dispatched as their own settle futures when the fetch lands. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 248 ++++++++++++------- 1 file changed, 163 insertions(+), 85 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 975c18a81..faea79752 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a synchronous transitive walk. The warm +//! plus a rayon-dispatched per-spec settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -13,13 +13,28 @@ //! pipeline consumer for `BuildEvent::PackageResolved` — install paths //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. +//! +//! ## Why settle is dispatched off-runtime +//! +//! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a +//! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` +//! over the manifest's raw bytes. That parse is 5–10ms per spec on a +//! 100KB body. Calling it inline on the tokio runtime (the v1 of this +//! module) starves the runtime worker — sibling fetches in flight stop +//! draining their sockets while the worker is parsing, which CI showed +//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the +//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` +//! (the same path the `extract_core_version_off_runtime` helper takes) +//! keeps the runtime free to drive I/O. 
-use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use futures::future::BoxFuture; use futures::stream::{FuturesUnordered, StreamExt}; -use crate::model::manifest::CoreVersionManifest; +use crate::model::manifest::{CoreVersionManifest, FullManifest, extract_core_version_off_runtime}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; @@ -41,8 +56,32 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } +/// Output of one in-flight future. The main loop merges fetch and settle +/// completions through a single `FuturesUnordered` so backpressure on +/// either side throttles the other naturally. +/// +/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- +/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes +/// the variant large enough that clippy flags the size delta with +/// `Settled`. The cost is one heap allocation per fetched manifest; +/// trivial against the network round-trip we already paid. +#[allow(clippy::large_enum_variant)] +enum FastEvent { + Fetched { + name: String, + primary_spec: String, + result: anyhow::Result, + elapsed_ms: u64, + }, + Settled { + new_deps: Vec, + }, +} + +type FastFut = Pin + Send>>; + /// Collect dependencies from any deps map, filtering out non-registry specs. -fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -62,29 +101,41 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. +/// Resolve `(name, spec)` against `full` off the tokio runtime. 
/// -/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does -/// after a cache hit — pick a version, parse just that subset, populate -/// the per-version cache slot the BFS phase will read from. Skips the -/// rayon/`spawn_blocking` hop because the caller is already doing -/// CPU-bound bookkeeping between fetches. -fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { - let Some(full) = cache.get_full_manifest(name) else { - return Vec::new(); - }; - let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { - return Vec::new(); - }; - if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { - return extract_transitive_deps(&cached, peer_deps); - } - let Some(core) = full.get_core_version(&resolved_version) else { - return Vec::new(); - }; - let core_arc = Arc::new(core); - cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); - extract_transitive_deps(&core_arc, peer_deps) +/// Returns the freshly-extracted version manifest's transitive deps so +/// the caller can extend its pending queue. The heavy +/// `simd_json::to_borrowed_value` parse runs inside +/// `extract_core_version_off_runtime`, which dispatches to rayon — same +/// path the BFS phase uses for cold extracts. 
+fn settle_future( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> BoxFuture<'static, FastEvent> { + Box::pin(async move { + let resolved_version = match resolve_target_version((&*full).into(), &spec) { + Ok(v) => v, + Err(_) => return FastEvent::Settled { new_deps: vec![] }, + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + return FastEvent::Settled { + new_deps: extract_transitive_deps(&cached, peer_deps), + }; + } + let (resolved_version, core) = + extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; + let new_deps = match core { + Some(core_arc) => { + cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) + } + None => Vec::new(), + }; + FastEvent::Settled { new_deps } + }) } /// Manifest-bench-style flat parallel fetch of all transitively-reachable @@ -103,17 +154,15 @@ pub async fn fast_preload( let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); // Specs we've already enqueued (or settled). Prevents duplicate - // sync resolutions from re-walking the same transitive subtree. + // settles from re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - // Names whose full manifest is either cached or in flight. Spec-level - // dedup happens in `seen_specs` above; this set is the gate that - // prevents two concurrent fetches for the same package (sibling - // specs queue against the in-flight one rather than racing). + // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); - // Specs that arrived while their package's full manifest was still - // in flight — we'll settle them once the fetch lands. 
- let mut deferred_specs: Vec<(String, String)> = Vec::new(); - let mut futs = FuturesUnordered::new(); + // Sibling specs that arrived while their package's full manifest + // was still in flight. The fetch's completion handler drains this + // bucket — we stash by name so the lookup is one HashMap probe. + let mut deferred_by_name: HashMap> = HashMap::new(); + let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; let peer_deps = config.peer_deps; @@ -126,28 +175,33 @@ pub async fn fast_preload( continue; } - // Full manifest already cached: skip the network round-trip, - // settle synchronously and queue this package's transitive - // deps. This is the hot path on the second-and-later spec - // for any popular package (lodash, semver, etc.). - if cache.get_full_manifest(&name).is_some() { - let new_deps = settle_spec(&name, &spec, cache, peer_deps); - pending.extend(new_deps); + // Hot path: the full manifest is already cached (a sibling + // spec for this name has already returned). Dispatch a + // settle so the parse work runs on rayon, not on the tokio + // worker — keeps the runtime free for ongoing fetches. + if let Some(full) = cache.get_full_manifest(&name) { + futs.push(Box::pin(settle_future( + name, + spec, + full, + cache.clone(), + peer_deps, + ))); continue; } - // Fetch in flight for this name — defer settling this spec - // until the fetch lands. The deferred set is small (only - // sibling specs for in-flight names) so the linear scan is - // cheaper than another HashMap. + // A fetch for this name is already in flight: stash this + // spec; the fetch's completion handler will dispatch its + // settle. 
if !fetched_names.insert(name.clone()) { - deferred_specs.push((name, spec)); + deferred_by_name.entry(name).or_default().push(spec); continue; } let registry_url = registry_url.to_string(); + let primary_spec = spec.clone(); let n = name.clone(); - futs.push(async move { + futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); let result = fetch_full_manifest(FetchManifestOptions { registry_url: ®istry_url, @@ -157,58 +211,82 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - (name, spec, result, elapsed_ms) - }); + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } + })); } if futs.is_empty() { break; } - let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + let Some(event) = futs.next().await else { break; }; - if stats.success_count == 0 && stats.failed_count == 0 { - stats.min_request_ms = elapsed_ms; - stats.max_request_ms = elapsed_ms; - } else { - stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); - stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); - } - stats.total_request_ms += elapsed_ms; + match event { + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } => { + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - stats.success_count += 1; - stats.fetched_names += 1; - cache.set_full_manifest(name.clone(), Arc::new(manifest)); + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + let full_arc = Arc::new(manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - let new_deps = settle_spec(&name, 
&spec, cache, peer_deps); - pending.extend(new_deps); + // Primary settle. + futs.push(Box::pin(settle_future( + name.clone(), + primary_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); - // Drain any sibling specs that arrived while this fetch - // was in flight. `extract_if`-style retain in place. - let mut i = 0; - while i < deferred_specs.len() { - if deferred_specs[i].0 == name { - let (n, s) = deferred_specs.swap_remove(i); - let new_deps = settle_spec(&n, &s, cache, peer_deps); - pending.extend(new_deps); - } else { - i += 1; + // Sibling settles that were stashed while the + // fetch was in flight. + if let Some(siblings) = deferred_by_name.remove(&name) { + for sibling_spec in siblings { + futs.push(Box::pin(settle_future( + name.clone(), + sibling_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is + // unreachable in practice; treat as soft failure. + stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is unreachable - // here in practice; treat it as a soft-failure to keep the - // path total. 
- stats.failed_count += 1; - } - Err(e) => { - stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); + FastEvent::Settled { new_deps } => { + pending.extend(new_deps); } } } From 04c9ec34d26fdb97f83014c9a09e241cd64715aa Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:19:48 +0800 Subject: [PATCH 13/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=2096=20(manifest-bench=20best)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone manifest-bench HTTP-only sweep (npmjs, h1) shows wall bottoming at concurrency=96 (1817ms) — earlier 256 regression was caused by rayon-queued parses behind 2 workers, no longer relevant since fetch parse is on spawn_blocking and settle is rayon-dispatched off the runtime. fast_preload's wave-shaped transitive walk currently runs at eff_parallel ~35 against the 64 cap because pending refills lag settles; raising the cap to 96 gives headroom for sustained in-flight on the deep waves without crossing the npmjs per-IP tail-latency cliff that conc 128+ trips. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index a0235830a..f05b0f52f 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,15 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to -// rayon, which has only num_cpus (=2 on GHA) workers. Bumping -// concurrency to 256 queued 256 parses behind 2 workers → wall -// per-parse jumped from 730µs to 266ms. 
Net p1 wall *increased* -// 3.10s → 3.33s on phases bench. Keep 64 until we address the -// parse-side queueing (e.g. inline parse on tokio, or a wider -// dedicated parse pool). +// Once we moved fetch parse off rayon to tokio's spawn_blocking pool +// (cap 512) and settle off the runtime via rayon::spawn, the original +// 256-concurrency regression mechanism (parses queued behind 2 rayon +// workers) no longer applies. The standalone manifest-bench HTTP-only +// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 +// (1817ms) and tracking flat-then-rising past that — beyond ~96 +// in-flight, npmjs's per-IP rate degrades and tail latency widens. +// 96 is the sweet spot: enough headroom for the wave-shaped transitive +// dep walk in fast_preload to keep the runtime busy, without paying the +// p99 widening that 128+ shows. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 6455852e518b3cc9859e12442972f40697360d73 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:46:06 +0800 Subject: [PATCH 14/24] perf(pm): fast_preload populates (name, spec) cache slot for BFS fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedRegistry::resolve_version_manifest`'s first cache check (service/registry.rs:347) keys on `(name, spec)` — the original spec string the caller passed, e.g. `^4.0.0`. settle_future was only populating `(name, resolved_version)` (e.g. `4.17.21`), so on every BFS edge for `lodash@^4.0.0`-style specs the warm path missed and fell into the OnceMap inflight gate + `resolve_via_full_manifest` re-walk before recovering the manifest from the `(name, resolved_version)` slot we'd already set. 
Now settle writes both keys so BFS hits the early-return at service/registry.rs:347 with no further dispatch. Saves ~1 OnceMap+resolve_target_version round-trip per unique (name, spec) the BFS encounters (≈3000 calls on ant-design-x). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index faea79752..c3845a73a 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -121,6 +121,8 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + // Populate the (name, spec) slot too — see comment below. + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), }; @@ -129,6 +131,18 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { + // Populate BOTH cache slots so the subsequent BFS hits the + // fast path on its first call: + // * `(name, resolved_version)` — what + // `resolve_via_full_manifest` writes in the cold path, + // and what `extract_core_version_off_runtime`'s callers + // elsewhere expect. + // * `(name, spec)` — what `resolve_version_manifest`'s + // first cache check uses (line 347 in service/registry.rs). + // Without this slot, BFS still pays one OnceMap dispatch + // + `resolve_via_full_manifest` walk per `(name, spec)`, + // even though we've already done that work here. 
+ cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) } From 4bbcae8083de94ea69b6ef19611cdb59c719ca9c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:12:08 +0800 Subject: [PATCH 15/24] perf(pm): fuse primary settle into fetch task to drop dispatch RTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fast_preload (v2) dispatched primary settles to rayon as separate FuturesUnordered futures. CI breakdown showed eff_parallel ~44 against the conc=96 cap — the wave-shaped transitive walk was held back by settle dispatch RTT: each fetch landed → primary settle queued → settle popped → only then did `pending` get transitive deps and fill the next dispatch wave. v3 folds the primary settle into the fetch task itself via `tokio::task::spawn_blocking`. The fetch task does the network round-trip and the primary version-extract on the same blocking pool slot, then returns with the resolved CoreVersionManifest attached. Main loop pulls one Fetched event, immediately extends `pending`, no second `next().await` to wait through the queue. Sibling specs (rare; same name, different range) still go through the rayon settle_future path so the primary path stays lean. Carries primary_spec through FastEvent so the fused path can populate both `(name, primary_spec)` and `(name, resolved_version)` cache slots — preserves the 6455852e BFS fast-path win. FetchOutcome enum replaces by-value FetchManifestResult to avoid a full FullManifest clone (HashMap+Vec) per fetch event. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 206 ++++++++++++------- 1 file changed, 135 insertions(+), 71 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index c3845a73a..008030139 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a rayon-dispatched per-spec settle. The warm +//! plus a fused-into-fetch primary settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -14,18 +14,30 @@ //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. //! -//! ## Why settle is dispatched off-runtime +//! ## Why settle is fused into the fetch task //! //! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a //! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` //! over the manifest's raw bytes. That parse is 5–10ms per spec on a -//! 100KB body. Calling it inline on the tokio runtime (the v1 of this -//! module) starves the runtime worker — sibling fetches in flight stop -//! draining their sockets while the worker is parsing, which CI showed -//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the -//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` -//! (the same path the `extract_core_version_off_runtime` helper takes) -//! keeps the runtime free to drive I/O. +//! 100KB body. +//! +//! v1 ran settle inline on the tokio runtime worker — that starved +//! 
sibling fetches' I/O drive (CI showed `avg_request` +3ms, +//! `avg_parse` 5→11ms). v2 dispatched settle to rayon via a separate +//! `FuturesUnordered` future, which fixed the runtime starvation but +//! introduced a dispatch RTT: fetch lands → rayon settle queued → settle +//! pops → `pending` finally gets transitive deps. That round-trip held +//! the wave-shaped transitive walk back, capping `eff_parallel` at ~44 +//! against a 96 cap. +//! +//! v3 (this) folds the primary settle into the fetch task itself via +//! `tokio::task::spawn_blocking`. The fetch task awaits both the +//! network round-trip and the version-extract on the same blocking +//! pool slot, then returns with the resolved `CoreVersionManifest` +//! attached. The main loop pulls a single `Fetched` event and +//! immediately extends `pending` — no separate settle pop. Sibling +//! specs (rare; same package, different range) still go through a +//! `Settled` future to keep the primary path lean. use std::collections::{HashMap, HashSet, VecDeque}; use std::pin::Pin; @@ -56,21 +68,31 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } -/// Output of one in-flight future. The main loop merges fetch and settle -/// completions through a single `FuturesUnordered` so backpressure on -/// either side throttles the other naturally. -/// -/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- -/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes -/// the variant large enough that clippy flags the size delta with -/// `Settled`. The cost is one heap allocation per fetched manifest; -/// trivial against the network round-trip we already paid. -#[allow(clippy::large_enum_variant)] +/// One fetch's primary settle outcome — the resolved version + parsed +/// `CoreVersionManifest` for the spec the fetch was originally issued +/// for. `None` means the spec didn't match any version (caller treats +/// as soft skip). 
+type PrimarySettle = Option<(String, Arc<CoreVersionManifest>)>;
+
+/// Outcome of a fetch task. Owning `Arc<FullManifest>` (rather than
+/// `FetchManifestResult` by-value) means the fetch task can `Arc::clone`
+/// once for the primary settle, then pass ownership along — no full
+/// `FullManifest` clone (which would copy the 200-entry `time`
+/// HashMap + the `versions` `Vec` per fetch).
+enum FetchOutcome {
+    Ok(Arc<FullManifest>),
+    NotModified,
+    Err,
+}
+
+/// Output of one in-flight future. The main loop merges fetch and
+/// sibling-settle completions through a single `FuturesUnordered`.
 enum FastEvent {
     Fetched {
         name: String,
         primary_spec: String,
-        result: anyhow::Result<FetchManifestResult>,
+        outcome: FetchOutcome,
+        primary_settle: PrimarySettle,
         elapsed_ms: u64,
     },
     Settled {
@@ -101,13 +123,9 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps)
     deps
 }
 
-/// Resolve `(name, spec)` against `full` off the tokio runtime.
-///
-/// Returns the freshly-extracted version manifest's transitive deps so
-/// the caller can extend its pending queue. The heavy
-/// `simd_json::to_borrowed_value` parse runs inside
-/// `extract_core_version_off_runtime`, which dispatches to rayon — same
-/// path the BFS phase uses for cold extracts.
+/// Off-runtime settle for a `(name, spec)` whose `FullManifest` is
+/// already cached. Used for sibling specs — multiple ranges on the
+/// same package — that arrive after the primary fetch has landed.
 fn settle_future(
     name: String,
     spec: String,
@@ -121,7 +139,6 @@ fn settle_future(
             Err(_) => return FastEvent::Settled { new_deps: vec![] },
         };
         if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) {
-            // Populate the (name, spec) slot too — see comment below.
cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), @@ -131,17 +148,6 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { - // Populate BOTH cache slots so the subsequent BFS hits the - // fast path on its first call: - // * `(name, resolved_version)` — what - // `resolve_via_full_manifest` writes in the cold path, - // and what `extract_core_version_off_runtime`'s callers - // elsewhere expect. - // * `(name, spec)` — what `resolve_version_manifest`'s - // first cache check uses (line 347 in service/registry.rs). - // Without this slot, BFS still pays one OnceMap dispatch - // + `resolve_via_full_manifest` walk per `(name, spec)`, - // even though we've already done that work here. cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) @@ -152,6 +158,35 @@ fn settle_future( }) } +/// Resolve `(name, spec)` against `full` on tokio's blocking pool. +/// +/// Same shape as `extract_core_version_off_runtime` (which uses rayon), +/// but stays inside the fetch task so the result lands together with +/// the network round-trip — no separate `FuturesUnordered` pop, so +/// `pending` gets the transitive deps the moment the fetch event is +/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is +/// `max(num_cpus, 8)`. With many primary settles arriving in waves, +/// the wider blocking pool absorbs the burst better than rayon would. 
+async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + }) + .await + .ok() + .flatten() + } + #[cfg(target_arch = "wasm32")] + { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + } +} + /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -167,14 +202,14 @@ pub async fn fast_preload( ) -> FastPreloadStats { let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); - // Specs we've already enqueued (or settled). Prevents duplicate - // settles from re-walking the same transitive subtree. + // Specs we've already enqueued. Prevents duplicate settles from + // re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); // Sibling specs that arrived while their package's full manifest - // was still in flight. The fetch's completion handler drains this - // bucket — we stash by name so the lookup is one HashMap probe. + // was still in flight. The fetch's completion handler dispatches + // settles for them, then drains this bucket. let mut deferred_by_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; @@ -189,10 +224,10 @@ pub async fn fast_preload( continue; } - // Hot path: the full manifest is already cached (a sibling - // spec for this name has already returned). 
Dispatch a - // settle so the parse work runs on rayon, not on the tokio - // worker — keeps the runtime free for ongoing fetches. + // Hot path: a sibling spec for this name has already + // returned, so the full manifest is cached. Settle on + // rayon (off-runtime) — keeps the primary fetch path + // (next branch) clean. if let Some(full) = cache.get_full_manifest(&name) { futs.push(Box::pin(settle_future( name, @@ -205,8 +240,8 @@ pub async fn fast_preload( } // A fetch for this name is already in flight: stash this - // spec; the fetch's completion handler will dispatch its - // settle. + // sibling spec; the fetch's completion handler will + // dispatch a settle for it. if !fetched_names.insert(name.clone()) { deferred_by_name.entry(name).or_default().push(spec); continue; @@ -225,10 +260,30 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; + // Fuse the primary settle into the same task so the + // main loop sees the resolved version + transitive + // deps in the same event — no extra `next().await` to + // wait through the FuturesUnordered queue before + // `pending` can refill. 
+ let (outcome, primary_settle) = match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + let full_arc = Arc::new(manifest); + let settle = + resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) + .await; + (FetchOutcome::Ok(full_arc), settle) + } + Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Err(e) => { + tracing::debug!("fast_preload failed for {}: {}", n, e); + (FetchOutcome::Err, None) + } + }; FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } })); @@ -246,7 +301,8 @@ pub async fn fast_preload( FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } => { if stats.success_count == 0 && stats.failed_count == 0 { @@ -258,24 +314,36 @@ pub async fn fast_preload( } stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { + match outcome { + FetchOutcome::Ok(full_arc) => { stats.success_count += 1; stats.fetched_names += 1; - let full_arc = Arc::new(manifest); cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - // Primary settle. - futs.push(Box::pin(settle_future( - name.clone(), - primary_spec, - Arc::clone(&full_arc), - cache.clone(), - peer_deps, - ))); + // Apply the primary settle (already done inside + // the fetch task via spawn_blocking) — populate + // both `(name, primary_spec)` and + // `(name, resolved_version)` cache slots so BFS + // hits the early-return at registry.rs:347 on + // its first probe, then extend `pending` with + // the spec's transitive deps. + if let Some((resolved_version, core_arc)) = primary_settle { + cache.set_version_manifest( + name.clone(), + primary_spec, + Arc::clone(&core_arc), + ); + cache.set_version_manifest( + name.clone(), + resolved_version, + Arc::clone(&core_arc), + ); + pending.extend(extract_transitive_deps(&core_arc, peer_deps)); + } - // Sibling settles that were stashed while the - // fetch was in flight. 
+ // Sibling specs that were stashed while the + // fetch was in flight: dispatch each as a + // separate settle future. if let Some(siblings) = deferred_by_name.remove(&name) { for sibling_spec in siblings { futs.push(Box::pin(settle_future( @@ -288,14 +356,10 @@ pub async fn fast_preload( } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is - // unreachable in practice; treat as soft failure. - stats.failed_count += 1; - } - Err(e) => { + FetchOutcome::NotModified | FetchOutcome::Err => { + // 304 is unreachable in practice (no ETag sent); + // both branches treated as soft failure. stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); } } } From 671ac98e51e4a7ca4e53149c8bead24b4f144451 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:42:55 +0800 Subject: [PATCH 16/24] perf(pm): combined-parse fetch path eliminates per-fetch double simd_json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fast_preload hot path was paying TWO simd_json passes per manifest: 1. fetch_full_manifest's parse_json_off_runtime did a typed simd_json::serde::from_slice (envelope + IgnoredAny visitor on `versions` keys, ~3-5ms on a 100KB body). 2. Primary settle re-parsed the same raw bytes with simd_json::to_borrowed_value (~5-10ms) to extract one version's subtree. Both passes went through simd_json's Tape constructor — duplicated work. CI showed avg_parse 5-7ms × 2700 fetches = 14-19s of CPU sum on 2-core GHA, where the spawn_blocking pool's overlapping schedule masked some of the cost but not all. 
Adds `service::manifest::fetch_full_manifest_with_settle`: same HTTP + retry + ETag machinery as `fetch_full_manifest`, but the parse step does ONE `to_borrowed_value` and extracts: * envelope (`name`, `dist-tags`, `versions` keys) into FullManifest manually (no typed serde), and * the resolved version's subtree as a typed CoreVersionManifest (serde-deserializing that single subtree via the borrowed value). fast_preload's fetch task switches to this entry point — primary settle is now a free byproduct of the fetch parse, not a separate `to_borrowed_value` pass. Sibling specs (same name, different range) still go through the rayon settle_future path. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 68 ++---- crates/ruborist/src/service/manifest.rs | 208 +++++++++++++++++++ crates/ruborist/src/service/mod.rs | 5 +- 3 files changed, 231 insertions(+), 50 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 008030139..d049321d8 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -51,7 +51,8 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::{ - FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, }; use crate::spec::SpecStr; use crate::util::FETCH_TIMINGS; @@ -158,35 +159,6 @@ fn settle_future( }) } -/// Resolve `(name, spec)` against `full` on tokio's blocking pool. 
-/// -/// Same shape as `extract_core_version_off_runtime` (which uses rayon), -/// but stays inside the fetch task so the result lands together with -/// the network round-trip — no separate `FuturesUnordered` pop, so -/// `pending` gets the transitive deps the moment the fetch event is -/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is -/// `max(num_cpus, 8)`. With many primary settles arriving in waves, -/// the wider blocking pool absorbs the burst better than rayon would. -async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { - #[cfg(not(target_arch = "wasm32"))] - { - tokio::task::spawn_blocking(move || { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - }) - .await - .ok() - .flatten() - } - #[cfg(target_arch = "wasm32")] - { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - } -} - /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -252,28 +224,28 @@ pub async fn fast_preload( let n = name.clone(); futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); - let result = fetch_full_manifest(FetchManifestOptions { - registry_url: ®istry_url, - name: &n, - format: MetadataFormat::Abbreviated, - etag: None, - }) + // Combined fetch + envelope parse + primary settle in + // a single `to_borrowed_value` pass — replaces the old + // pattern of typed-serde envelope parse followed by a + // separate `to_borrowed_value` reparse for version + // extraction. Halves simd_json work per fetch. 
+ let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &primary_spec, + ) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - // Fuse the primary settle into the same task so the - // main loop sees the resolved version + transitive - // deps in the same event — no extra `next().await` to - // wait through the FuturesUnordered queue before - // `pending` can refill. let (outcome, primary_settle) = match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - let full_arc = Arc::new(manifest); - let settle = - resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) - .await; - (FetchOutcome::Ok(full_arc), settle) + Ok(FetchWithSettleResult::Ok(payload)) => { + let full_arc = Arc::new(payload.manifest); + (FetchOutcome::Ok(full_arc), payload.primary_settle) } - Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Ok(FetchWithSettleResult::NotModified) => (FetchOutcome::NotModified, None), Err(e) => { tracing::debug!("fast_preload failed for {}: {}", n, e); (FetchOutcome::Err, None) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 90f1db71b..38db87969 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -4,7 +4,11 @@ //! [`crate::service::fetch`] so retry policy stays uniform across registry //! manifest fetches and non-registry resolvers (git, http tarball). +use std::collections::HashMap; +use std::sync::Arc; + use anyhow::{Result, anyhow}; +use serde::Deserialize; use tokio_retry::RetryIf; use super::fetch::{ @@ -12,6 +16,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::resolver::version::resolve_target_version; use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on tokio's blocking thread pool. 
@@ -157,6 +162,209 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result<FetchManifestResult> {
+pub struct FetchWithSettle {
+    pub manifest: FullManifest,
+    pub etag: Option<String>,
+    /// `Some` when the requested spec resolves to a real version in
+    /// `manifest.versions`. `None` only on no-match (rare; usually a
+    /// spec referring to a yanked or moved version).
+    pub primary_settle: Option<PrimarySettleResult>,
+}
+
+/// `(resolved_version, parsed_subtree)` — what
+/// [`fetch_full_manifest_with_settle`] hands back to callers that
+/// supplied a `primary_spec`.
+pub type PrimarySettleResult = (String, Arc<CoreVersionManifest>);
+
+#[allow(clippy::large_enum_variant)]
+pub enum FetchWithSettleResult {
+    Ok(FetchWithSettle),
+    NotModified,
+}
+
+/// Fetch a full manifest and resolve the primary spec from the same
+/// parse pass.
+///
+/// Where [`fetch_full_manifest`] uses `simd_json::serde::from_slice`
+/// to materialize a typed `FullManifest` (cheap envelope, deep
+/// `versions` subtrees skipped via `IgnoredAny`) and leaves version
+/// subtree extraction to a later `simd_json::to_borrowed_value`
+/// reparse, this entry point does the borrowed-value parse once and
+/// extracts:
+/// * envelope fields needed by the resolver (`name`, `dist-tags`,
+///   `versions` keys),
+/// * the resolved-version subtree as a typed
+///   [`CoreVersionManifest`].
+///
+/// Saves one full simd_json pass on the parse hot path —
+/// `fast_preload` uses ~2700 of these per `utoo deps` cold run, so
+/// halving the per-fetch parse work meaningfully reduces CPU on
+/// 2-core CI.
+pub async fn fetch_full_manifest_with_settle( + opts: FetchManifestOptions<'_>, + primary_spec: &str, +) -> Result { + let url = format!("{}/{}", opts.registry_url, opts.name); + let etag_owned = opts.etag.map(|s| s.to_string()); + let primary_spec_owned = primary_spec.to_string(); + let accept = match opts.format { + MetadataFormat::Abbreviated => "application/vnd.npm.install-v1+json", + MetadataFormat::Complete => "application/json", + }; + + RetryIf::spawn( + retry_strategy(), + || { + let url = url.clone(); + let etag = etag_owned.clone(); + let primary_spec = primary_spec_owned.clone(); + async move { + let mut request = get_client() + .map_err(FetchError::Permanent)? + .get(&url) + .header("Accept", accept); + if let Some(etag_value) = &etag { + request = request.header("If-None-Match", etag_value); + } + + let t_request_start = std::time::Instant::now(); + let response = request.send().await.map_err(classify_reqwest_error)?; + let request_us = t_request_start.elapsed().as_micros() as u64; + let status = response.status(); + + if status == reqwest::StatusCode::NOT_MODIFIED { + if etag.is_some() { + return Ok(FetchWithSettleResult::NotModified); + } + return Err(classify_status(status, &url)); + } + + if status.is_success() { + let new_etag = response + .headers() + .get("etag") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let t_body_start = std::time::Instant::now(); + let raw_bytes = response + .bytes() + .await + .map_err(|e| FetchError::Permanent(anyhow!("Response read error: {e}")))? 
+ .to_vec(); + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = raw_bytes.len() as u64; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes); + + let t_parse_start = std::time::Instant::now(); + let parse_result = + parse_envelope_and_settle(Arc::clone(&raw_arc), primary_spec) + .await + .map_err(FetchError::Permanent)?; + let parse_us = t_parse_start.elapsed().as_micros() as u64; + + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + + let (manifest, primary_settle) = parse_result; + Ok(FetchWithSettleResult::Ok(FetchWithSettle { + manifest, + etag: new_etag, + primary_settle, + })) + } else { + Err(classify_status(status, &url)) + } + } + }, + is_retryable, + ) + .await + .map_err(|e| match e { + FetchError::Retryable(e) | FetchError::Permanent(e) => { + anyhow!("Failed to fetch {}: {:#}", opts.name, e) + } + }) +} + +/// Off-runtime combined parse: `simd_json::to_borrowed_value` once, +/// extract envelope into [`FullManifest`] + resolve `primary_spec` +/// against the parsed `versions` keys + materialize the resolved +/// version's subtree into [`CoreVersionManifest`]. +/// +/// Constructs `FullManifest` manually rather than via typed serde so +/// the work is exactly one parse pass. Other `FullManifest` fields +/// (`description`, `time`, `maintainers`, etc.) stay at `Default` +/// values — none are read on the resolver hot path. +async fn parse_envelope_and_settle( + raw: Arc<[u8]>, + primary_spec: String, +) -> Result<(FullManifest, Option)> { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || parse_envelope_and_settle_sync(raw, &primary_spec)) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? 
+    }
+    #[cfg(target_arch = "wasm32")]
+    {
+        parse_envelope_and_settle_sync(raw, &primary_spec)
+    }
+}
+
+fn parse_envelope_and_settle_sync(
+    raw: Arc<[u8]>,
+    primary_spec: &str,
+) -> Result<(FullManifest, Option<PrimarySettleResult>)> {
+    use simd_json::prelude::{ValueAsScalar, ValueObjectAccess};
+
+    let mut buf = (*raw).to_vec();
+    let parsed =
+        simd_json::to_borrowed_value(&mut buf).map_err(|e| anyhow!("JSON parse error: {e}"))?;
+
+    let name = parsed
+        .get("name")
+        .and_then(|v| v.as_str())
+        .map(|s| s.to_string())
+        .unwrap_or_default();
+
+    let dist_tags: HashMap<String, String> = parsed
+        .get("dist-tags")
+        .and_then(|v| HashMap::<String, String>::deserialize(v).ok())
+        .unwrap_or_default();
+
+    let versions_keys: Vec<String> = parsed
+        .get("versions")
+        .and_then(simd_json::prelude::ValueAsObject::as_object)
+        .map(|obj| obj.keys().map(|k| k.to_string()).collect())
+        .unwrap_or_default();
+
+    let manifest = FullManifest {
+        name,
+        dist_tags: dist_tags.clone(),
+        versions: versions_keys,
+        raw,
+        ..Default::default()
+    };
+
+    // Resolve spec against the just-extracted envelope.
+    let primary_settle = match resolve_target_version((&manifest).into(), primary_spec) {
+        Ok(resolved) => parsed
+            .get("versions")
+            .and_then(|v| v.get(resolved.as_str()))
+            .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())
+            .map(|core| (resolved, Arc::new(core))),
+        Err(_) => None,
+    };
+
+    Ok((manifest, primary_settle))
+}
+
 /// Fetch full manifest without ETag / 304 support.
/// /// Convenience wrapper around [`fetch_full_manifest`] for callers that never diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 13109e994..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -60,8 +60,9 @@ pub use cache::{ pub use fs::{Glob, NoopGlob, exists, read_to_string}; pub use http::client_builder; pub use manifest::{ - FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, MetadataFormat, - fetch_full_manifest, fetch_full_manifest_fresh, fetch_version_manifest, + FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, FetchWithSettle, + FetchWithSettleResult, MetadataFormat, fetch_full_manifest, fetch_full_manifest_fresh, + fetch_full_manifest_with_settle, fetch_version_manifest, }; pub use registry::UnifiedRegistry; pub use store::{ManifestStore, NoopStore}; From 542d7f144ec700ab5601247eff655399585fedbe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:11:45 +0800 Subject: [PATCH 17/24] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2096=20=E2=86=92=20128?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After 671ac98e's combined-parse fetch path eliminated the double simd_json pass, the spawn_blocking pool's contention ceiling rose enough that bumping concurrency past 96 no longer queues parses behind 2-core CPU. manifest-bench's most recent good-network sweep on GHA showed conc=128 hitting 1500ms vs conc=96 at 1566ms — small but real headroom for fast_preload's late-wave saturation now that initial waves fill faster. Risk: on slower-network runs (npmjs per-IP throttle), conc=128 widens p99. Earlier conc-sweep data was mixed — accepting that variance for the average-case improvement. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index f05b0f52f..2f389379e 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,18 +137,17 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once we moved fetch parse off rayon to tokio's spawn_blocking pool -// (cap 512) and settle off the runtime via rayon::spawn, the original -// 256-concurrency regression mechanism (parses queued behind 2 rayon -// workers) no longer applies. The standalone manifest-bench HTTP-only -// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 -// (1817ms) and tracking flat-then-rising past that — beyond ~96 -// in-flight, npmjs's per-IP rate degrades and tail latency widens. -// 96 is the sweet spot: enough headroom for the wave-shaped transitive -// dep walk in fast_preload to keep the runtime busy, without paying the -// p99 widening that 128+ shows. +// Once parse work shrank (combined `to_borrowed_value` pass replaces +// the typed-serde envelope parse + reparse), spawn_blocking pool +// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep +// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — +// in the most recent good-network run, conc=128 hit 1500ms vs +// conc=96 at 1566ms. Bumping to 128 narrows the gap between +// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 +// because pending takes ~2 wave depths to fill) and the cap, so +// the late-wave saturation has more headroom. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From c8768ac4ce8ca26a60a3313e22dba7ac625665d7 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:39:24 +0800 Subject: [PATCH 18/24] revert(pm): manifests-concurrency-limit back to 96 542d7f14's conc=128 bench landed in a slow-network run (mb best 2010ms vs 1500ms in the prior good-network run; bun also bumped to 2.14s vs 1.83s). Adjusted gap to mb best stayed flat (~700ms either way), so conc=128 didn't beat 96 across runs. Picking 96 as the conservative default: at-or-near best on every GHA run we've measured, never the worst, and leaves headroom for npmjs's per-IP throttling to absorb without compounding p99. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 2f389379e..f6924f5aa 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,17 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once parse work shrank (combined `to_borrowed_value` pass replaces -// the typed-serde envelope parse + reparse), spawn_blocking pool -// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep -// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — -// in the most recent good-network run, conc=128 hit 1500ms vs -// conc=96 at 1566ms. 
Bumping to 128 narrows the gap between -// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 -// because pending takes ~2 wave depths to fill) and the cap, so -// the late-wave saturation has more headroom. +// manifest-bench's HTTP-only sweep on GHA (npmjs, h1) bottoms out +// somewhere in the 96-128 band — which one wins varies with npmjs's +// per-IP latency on each run (good runs picked 128, slow-network +// runs flattened the curve and even regressed at 128 due to wider +// p99 from queued requests). 96 is the conservative pick: it's at +// or near best on every run we've measured, never the worst, and +// leaves headroom for npmjs to throttle without compounding queue +// time. Combined-parse fetch (671ac98e) made the spawn_blocking +// pool no longer a contention bottleneck, but didn't change the +// network-side variance — that's what caps the useful concurrency. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 3be7487d7ad772667ac125ce82955432c257f8d3 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 09:57:58 +0800 Subject: [PATCH 19/24] perf(pm): mb_resolve experimental fetch path (parallel track to fast_preload) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::mb_resolve module + service::build_deps_mb entry point as a parallel-track alternative to fast_preload, structured to match manifest-bench's main-loop shape as closely as correctness allows. Hypothesis under test: fast_preload's eff_parallel caps at ~50/96 because the FastEvent enum match + cache writes + sibling deferred bookkeeping in the main loop competes with tokio runtime workers for the 2 CPU cores on GHA, stalling socket I/O drive. 
mb_fetch pushes ALL per-fetch work into the spawned future itself (including cache writes), so the main loop is reduced to: while let Some(deps) = futs.next().await { pending.extend(deps); refill_to_cap(...); } Sibling specs (multiple ranges on same package) are NOT deferred at queue level — racing fetches for the same name both proceed. The race converges naturally: first fetch to land populates full_manifests, subsequent racers find the cache hit on entry and short-circuit to a sibling-style settle. Wastes ~5-50 network requests in real workloads but eliminates the HashMap probe + drain overhead from the hot loop. Wired in via UTOO_RESOLVE=mb env var: - Context::build_deps (utoo deps) routes through build_deps_mb - pipeline::resolve_with_pipeline (utoo install) also routes through it; pipeline workers still start but don't pipeline during fetch (mb_fetch emits no PackageResolved events) — install becomes phase-sequential, useful for resolve-phase A/B. bench script enables UTOO_RESOLVE=mb so CI measures the new path against existing baselines (utoo-next/utoo-npm/bun ignore the env var). Comment the export line to A/B back against fast_preload. Old fast_preload + UnifiedRegistry paths untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/pm-bench-phases.sh | 7 + crates/pm/src/helper/ruborist_context.rs | 12 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 243 +++++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 161 ++++++++++++++ crates/ruborist/src/service/mod.rs | 2 +- 7 files changed, 440 insertions(+), 3 deletions(-) create mode 100644 crates/ruborist/src/resolver/mb_resolve.rs diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 226ffb751..26e43388c 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,6 +22,13 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" +# Route the current `utoo` binary's resolve phase through the +# experimental `mb_resolve` flat-fetch path. Other PMs ignore this +# env var (utoo-next is built from origin/next which doesn't have +# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out +# to A/B against the default `fast_preload` path. +export UTOO_RESOLVE=mb + # Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN # by the optional "Build next branch utoo" step. Local runs without them diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index bc4d7faa1..542664f8c 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -87,10 +87,20 @@ impl Context { /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes /// `PackageResolved` events here, so preload is pure overhead — BFS's /// own per-level parallel prefetch warms the manifest cache. 
+ /// + /// Set `UTOO_RESOLVE=mb` to opt into the experimental + /// manifest-bench-style fetch path (`build_deps_mb`) for A/B + /// benchmarking against the current `fast_preload`. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let output = utoo_ruborist::service::build_deps(options).await?; + let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 719d31d13..4169ca88d 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,7 +41,22 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - let output = utoo_ruborist::service::build_deps(options).await?; + // `UTOO_RESOLVE=mb` reroutes install through the experimental + // mb-style fetch path. Pipeline workers are still started, but + // because mb_fetch doesn't emit `PackageResolved` events, the + // pipeline only fires once BFS completes (graph_to_package_lock + // emits `PackagePlaced` from BFS). Install becomes + // phase-sequential — fetch all manifests, then download + + // clone. Useful for A/B benchmarking the resolve phase in + // isolation; the pipelining advantage of the default path is + // lost. 
+ let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs new file mode 100644 index 000000000..2928638be --- /dev/null +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -0,0 +1,243 @@ +//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! +//! A parallel-track alternative to [`super::fast_preload`], structured +//! to match `manifest-bench`'s main-loop shape as closely as +//! correctness allows. The hypothesis under test: `fast_preload`'s +//! eff_parallel caps at ~50 against a 96-cap because the main loop's +//! CPU work (FastEvent enum match + cache writes + sibling-deferred +//! bookkeeping + Box::pin allocation) competes with tokio runtime +//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! +//! `mb_resolve` pushes ALL per-fetch work into the spawned future +//! itself (cache writes included) so the main loop is reduced to: +//! +//! ```ignore +//! while let Some(deps) = futs.next().await { +//! pending.extend(deps); +//! refill_to_cap(&mut futs, &mut pending, ...); +//! } +//! ``` +//! +//! Sibling specs (multiple ranges on the same package) are NOT +//! deferred at queue level — if two specs for the same name race, +//! both fetch. This wastes a small number of network requests (~5-50 +//! across a real install) but keeps the main loop's per-event cost +//! minimal (no HashMap probe / drain). The race converges: whichever +//! fetch lands first populates `full_manifests`; subsequent racers +//! 
find the cache hit on entry and short-circuit to a sibling-style +//! settle without re-fetching. +//! +//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` +//! and `utoo install` route through this when set; install loses +//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but +//! gains the lean main loop for resolve-phase A/B testing. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +#[derive(Debug, Default)] +pub struct MbFetchStats { + pub success: usize, + pub fail: usize, +} + +/// Collect dependencies from a deps map, filtering non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut out = Vec::new(); + out.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + out.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + out.extend(collect_deps(manifest.optional_dependencies.as_ref())); + out +} + +/// Settle one (name, spec) against an already-cached `FullManifest`. +/// Used for sibling specs (or racing-fetch losers) — extracts the +/// resolved version's `CoreVersionManifest` on the blocking pool, +/// populates both `(name, spec)` and `(name, resolved_version)` cache +/// slots so BFS hits the early-return fast path. 
+async fn settle_sibling( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name, spec, Arc::clone(&cached)); + return extract_transitive(&cached, peer_deps); + } + + let resolved_for_parse = resolved.clone(); + let full_for_parse = Arc::clone(&full); + let core_opt = tokio::task::spawn_blocking(move || { + full_for_parse + .get_core_version(&resolved_for_parse) + .map(Arc::new) + }) + .await + .ok() + .flatten(); + + let Some(core_arc) = core_opt else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Self-contained per-spec future. Either fetches `(name)`'s full +/// manifest from the registry (if not yet cached), or settles against +/// an already-cached one. In both cases it: +/// * writes `full_manifests` and `version_manifests` cache slots +/// for the resolved spec, +/// * returns the spec's transitive deps for the main loop to +/// enqueue. +/// +/// Racing-fetch handling: two specs for the same name dispatched +/// concurrently both enter the fetch branch (no in-flight gate). The +/// second one re-issues a network round-trip; the cost is bounded by +/// the small number of sibling specs in real workloads (<2% in +/// ant-design-x). Last writer to `cache.set_full_manifest` wins; +/// content is identical so correctness is preserved. +async fn fetch_or_settle( + name: String, + spec: String, + registry_url: String, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + // Sibling fast path: full manifest already cached. 
+ if let Some(full) = cache.get_full_manifest(&name) { + return settle_sibling(name, spec, full, cache, peer_deps).await; + } + + let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &name, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &spec, + ) + .await; + + let Ok(FetchWithSettleResult::Ok(payload)) = result else { + return Vec::new(); + }; + + let full_arc = Arc::new(payload.manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + + let Some((resolved, core_arc)) = payload.primary_settle else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch. See module docs for the +/// rationale. +pub async fn mb_fetch( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> MbFetchStats { + let mut stats = MbFetchStats::default(); + let mut pending: VecDeque = initial_deps.into(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut futs = FuturesUnordered::new(); + let cap = config.concurrency; + let peer_deps = config.peer_deps; + let registry_url = registry_url.to_string(); + + let start = tokio::time::Instant::now(); + + // Initial fill — same shape as the refill below. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + + while let Some(transitive) = futs.next().await { + if transitive.is_empty() { + // Empty result is ambiguous (no transitive deps OR fetch + // failed) — `MbFetchStats` only tracks success/fail at a + // coarse level. 
The fetch-timings counters (recorded + // inside `fetch_full_manifest_with_settle`) carry the + // detailed per-fetch metrics. + stats.fail += 1; + } else { + stats.success += 1; + } + pending.extend(transitive); + + // Refill — same body as the initial fill above. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + } + + let wall = start.elapsed(); + tracing::info!( + "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", + wall.as_millis(), + stats.success, + stats.fail, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index e7baad988..2d0a288d9 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -8,6 +8,7 @@ pub mod fast_preload; pub mod git; #[cfg(feature = "http-tarball")] pub mod http; +pub mod mb_resolve; pub mod preload; pub mod registry; pub mod runtime; diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 3b9b713ea..9687fc875 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -39,6 +39,7 @@ use crate::resolver::builder::{ gather_preload_deps, }; use crate::resolver::fast_preload::fast_preload; +use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; @@ -332,6 +333,166 @@ where }) } +/// Experimental parallel-track entry point: structurally identical to +/// [`build_deps`] but routes the manifest-fetch phase through +/// [`crate::resolver::mb_resolve::mb_fetch`] instead of +/// [`crate::resolver::fast_preload::fast_preload`]. 
+/// +/// Intended for A/B benchmarking: install + lockfile-only callers can +/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). +/// All other behavior — workspace discovery, runtime injection, BFS, +/// graph→lock serialization, project cache export — is the same as +/// `build_deps`. The `EventReceiver` still receives BFS events; it +/// does NOT receive `PreloadFetching` / `PreloadProgress` events +/// because mb_fetch is silent (matches `manifest-bench`'s zero-event +/// loop). +/// +/// **Install-path note:** `pipeline_deps_options` callers that need +/// `PackageResolved` events to drive the download/clone pipeline +/// won't pipeline under this path — mb_fetch finishes all fetches +/// before BFS starts. Use only for `utoo deps`-style workloads, or +/// accept that install becomes phase-sequential. +pub async fn build_deps_mb(options: BuildDepsOptions) -> Result +where + G: Glob + Clone, + R: EventReceiver, +{ + let BuildDepsOptions { + cwd, + registry_url, + cache_dir, + manifest_store, + warm_project_cache, + concurrency, + peer_deps, + glob, + receiver, + supports_semver, + catalogs, + skip_preload: _, + } = options; + + // Steps 1-6: structurally identical to `build_deps` — read + // package.json, inject runtime deps, build initial graph, add + // root edges, discover and add workspaces. 
+ let discovery = WorkspaceDiscovery::new(glob.clone()); + let root_path = discovery.find_root_path(&cwd).await?; + let pkg_path = root_path.join("package.json"); + let mut pkg: PackageJson = super::fs::read_json(&pkg_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; + + if let Some(engines) = &pkg.engines { + let runtime_deps = install_runtime_from_map(engines); + if !runtime_deps.is_empty() { + for (name, version) in runtime_deps { + pkg.optional_dependencies + .get_or_insert_with(HashMap::new) + .entry(name) + .or_insert(version); + } + } + } + + let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); + let root_index = graph.root_index; + let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); + add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); + + let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; + for workspace in workspaces { + let ws_pkg = workspace.package_json; + let workspace_node = + PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let workspace_index = graph.add_node(workspace_node); + let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let link_index = graph.add_node(link_node); + graph.add_physical_edge(root_index, workspace_index); + graph.add_physical_edge(root_index, link_index); + let dep_edge_id = graph.add_dependency_edge( + root_index, + workspace.name.clone(), + &ws_pkg.version, + EdgeType::Prod, + ); + graph.mark_dependency_resolved(dep_edge_id, workspace_index); + add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); + } + + // Step 7-8: cache + registry, same as `build_deps`. Warm project + // cache is honored. 
+ let package_cache = Arc::new(PackageCache::default()); + let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); + + let mut builder = UnifiedRegistry::builder() + .registry(®istry_url) + .cache(package_cache) + .store(Arc::clone(&manifest_store)); + if let Some(semver) = supports_semver { + builder = builder.supports_semver(semver); + } + let registry = builder.build(); + + // Run mb_fetch instead of fast_preload — pre-warms cache by + // walking transitive deps via flat FuturesUnordered. Skipped if + // the warm project cache already covers the workload. + if cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + mb_fetch( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + + // BFS phase reads the now-warm cache. `skip_preload=true` skips + // the receiver-driven preload — mb_fetch already ran. + let mut config = BuildDepsConfig::default() + .with_peer_deps(peer_deps) + .with_concurrency(concurrency) + .with_skip_preload(true) + .with_catalogs(catalogs); + if let Some(dir) = cache_dir { + config = config.with_cache_dir(dir); + } + + build_deps_with_config(&mut graph, ®istry, config, &receiver) + .await + .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + + let t_serialize_start = std::time::Instant::now(); + let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; + + let t_cache_export_start = std::time::Instant::now(); + let mut project_cache = ProjectCacheData::default(); + for (key, manifest) in registry.cache().export_version_manifests() { + let (name, spec) = parse_package_spec(&key); + let version = manifest.version.clone(); + let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); + pkg_cache.specs.insert(spec.to_string(), version.clone()); + 
pkg_cache.manifests.insert(version, (*manifest).clone()); + } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); + + Ok(BuildDepsOutput { + lock: PackageLock::new(&pkg.name, &pkg.version, packages), + project_cache, + }) +} + /// Pre-populate `cache` from a warm project cache. Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 5adb6bf0b..7a7cf8ca8 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 02cc12e7a23214672215a1ee1efd6317e7ce6d8c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 10:39:27 +0800 Subject: [PATCH 20/24] =?UTF-8?q?perf(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20two-phase=20pure=20HTTP=20+=20rayon=20batch=20parse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1/v2 ran parse work in spawn_blocking inside each fetch future, which competed with tokio runtime workers for the 2 GHA cores. CI showed eff_parallel capped at 47/96 vs manifest-bench standalone's 75/96 on the same box. Hypothesis: parse CPU starves socket drive. v3 separates the two phases: * PHASE 1 — `mb_style_pure_fetch` is a structural copy of `manifest-bench`'s main loop: future body does ONLY GET + body recv, refill 1-for-1 on completion. Zero per-future CPU work, so tokio runtime workers retain full CPU for socket drive. 
* PHASE 2 — bulk rayon par_iter parse: for each body, parse `FullManifest` envelope via simd_json::to_borrowed_value, resolve every queued spec for this name against the just-parsed manifest, populate cache slots, collect transitive deps. Runs off the tokio runtime entirely (spawn_blocking → rayon par_iter). Phases alternate until pending exhausted. Typical project: 3-5 iterations as the dep tree fans out wave by wave. The point of the split is the `phase1_http_wall` trace — measured in isolation from any parse work, it should match manifest-bench's standalone wall (~1.5-2.0s for 2733 names @ conc=96). If it does, the remaining gap to mb is concentrated in phase 2 work, which is inherent to discovering transitive deps from a non-flat name list. Tracing per iteration: p1-breakdown mb_fetch iter=N phase1_http_wall=Xms n=Y bytes=Z p1-breakdown mb_fetch iter=N phase2_parse_wall=Xms settles=Y new_transitives=Z p1-breakdown mb_fetch total_wall=Xms iters=Y Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 494 ++++++++++++++------- 1 file changed, 332 insertions(+), 162 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 2928638be..05e1bf038 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,61 +1,87 @@ -//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors +//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + +//! settle. //! -//! A parallel-track alternative to [`super::fast_preload`], structured -//! to match `manifest-bench`'s main-loop shape as closely as -//! correctness allows. The hypothesis under test: `fast_preload`'s -//! eff_parallel caps at ~50 against a 96-cap because the main loop's -//! CPU work (FastEvent enum match + cache writes + sibling-deferred -//! 
bookkeeping + Box::pin allocation) competes with tokio runtime -//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! ## Phase split //! -//! `mb_resolve` pushes ALL per-fetch work into the spawned future -//! itself (cache writes included) so the main loop is reduced to: +//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` +//! ran in `spawn_blocking` threads that competed with tokio runtime +//! workers for CPU on the 2-core GHA box. When 50+ parses ran in +//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` +//! capped at ~47 against the 96 cap (vs `manifest-bench` standalone's +//! 75 on the same box). //! -//! ```ignore -//! while let Some(deps) = futs.next().await { -//! pending.extend(deps); -//! refill_to_cap(&mut futs, &mut pending, ...); -//! } -//! ``` +//! v3 separates the work: //! -//! Sibling specs (multiple ranges on the same package) are NOT -//! deferred at queue level — if two specs for the same name race, -//! both fetch. This wastes a small number of network requests (~5-50 -//! across a real install) but keeps the main loop's per-event cost -//! minimal (no HashMap probe / drain). The race converges: whichever -//! fetch lands first populates `full_manifests`; subsequent racers -//! find the cache hit on entry and short-circuit to a sibling-style -//! settle without re-fetching. +//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of +//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, +//! nothing else) + 1-for-1 refill on completion. The future body +//! has zero CPU work, so the tokio runtime workers retain full CPU +//! to drive sockets and `eff_parallel` reaches the same level as +//! the standalone bench. //! -//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` -//! and `utoo install` route through this when set; install loses -//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but -//! 
gains the lean main loop for resolve-phase A/B testing. +//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For +//! each fetched body: parse `FullManifest` envelope, resolve every +//! spec we need for this name, materialize `CoreVersionManifest` +//! subtrees, populate cache slots, collect transitive deps for the +//! next iteration. +//! +//! Phases alternate until `pending` is empty (typical project: 3-5 +//! iterations as transitive deps fan out wave by wave). +//! +//! Phase 1 is the line we measure against `manifest-bench` — +//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us +//! check eff_parallel directly. +//! +//! Wired in via `UTOO_RESOLVE=mb` env var (see +//! `pm::helper::ruborist_context::Context::build_deps`). -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use bytes::Bytes; use futures::stream::{FuturesUnordered, StreamExt}; +use rayon::prelude::*; +use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; -use crate::service::{ - FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, - fetch_full_manifest_with_settle, -}; +use crate::service::MemoryCache; +use crate::service::http::get_client; use crate::spec::SpecStr; -use crate::util::FETCH_TIMINGS; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, + pub iterations: usize, +} + +/// Phase 1 result: one body per fetched name. `bytes` is `None` on +/// transport / non-2xx — kept in the result vector so phase 2 can +/// account for it, but contributes no settle work. +struct FetchOutcome { + name: String, + bytes: Option, } -/// Collect dependencies from a deps map, filtering non-registry specs. 
-fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +/// Phase 2 per-name output. `full` is `None` on parse failure. +struct ParseOutcome { + name: String, + full: Option>, + /// Per-spec settled subtrees: `(spec, resolved_version, core)`. + /// Empty when the body failed to fetch / parse, or when no spec + /// resolves against the manifest. + settled: Vec<(String, String, Arc)>, + /// Transitive deps collected across all settled subtrees for this + /// name. Already filtered to registry specs; the main loop dedups + /// against `done_names` before queueing. + transitives: Vec, +} + +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -73,99 +99,177 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Settle one (name, spec) against an already-cached `FullManifest`. -/// Used for sibling specs (or racing-fetch losers) — extracts the -/// resolved version's `CoreVersionManifest` on the blocking pool, -/// populates both `(name, spec)` and `(name, resolved_version)` cache -/// slots so BFS hits the early-return fast path. -async fn settle_sibling( - name: String, - spec: String, - full: Arc, - cache: MemoryCache, - peer_deps: PeerDeps, -) -> Vec { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - return Vec::new(); +/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future +/// body does ONLY GET + body recv; no parse, no cache writes, no +/// dedup. Returns one `FetchOutcome` per input name in arrival order. 
+async fn mb_style_pure_fetch( + names: Vec, + registry_url: &str, + concurrency: usize, +) -> Vec { + let client = match get_client() { + Ok(c) => c.clone(), + Err(e) => { + tracing::warn!("get_client failed: {e}"); + return Vec::new(); + } }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name, spec, Arc::clone(&cached)); - return extract_transitive(&cached, peer_deps); - } - let resolved_for_parse = resolved.clone(); - let full_for_parse = Arc::clone(&full); - let core_opt = tokio::task::spawn_blocking(move || { - full_for_parse - .get_core_version(&resolved_for_parse) - .map(Arc::new) - }) - .await - .ok() - .flatten(); + let mut results: Vec = Vec::with_capacity(names.len()); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; - let Some(core_arc) = core_opt else { - return Vec::new(); + let spawn_one = |client: &reqwest::Client, + registry_url: &str, + name: String, + futs: &mut FuturesUnordered<_>| { + let url = format!("{}/{}", registry_url, name); + let client = client.clone(); + futs.push(Box::pin(async move { + let bytes = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), + _ => None, + }; + FetchOutcome { name, bytes } + })); }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + + while idx < names.len() && futs.len() < concurrency { + spawn_one(&client, registry_url, names[idx].clone(), &mut futs); + idx += 1; + } + + while let Some(outcome) = futs.next().await { + results.push(outcome); + if idx < names.len() { + spawn_one(&client, registry_url, names[idx].clone(), &mut futs); + idx += 1; + } + } + + results } -/// Self-contained per-spec future. 
Either fetches `(name)`'s full -/// manifest from the registry (if not yet cached), or settles against -/// an already-cached one. In both cases it: -/// * writes `full_manifests` and `version_manifests` cache slots -/// for the resolved spec, -/// * returns the spec's transitive deps for the main loop to -/// enqueue. -/// -/// Racing-fetch handling: two specs for the same name dispatched -/// concurrently both enter the fetch branch (no in-flight gate). The -/// second one re-issues a network round-trip; the cost is bounded by -/// the small number of sibling specs in real workloads (<2% in -/// ant-design-x). Last writer to `cache.set_full_manifest` wins; -/// content is identical so correctness is preserved. -async fn fetch_or_settle( +/// Sync phase 2 worker: parse one body, settle all specs we need for +/// this name. Runs on rayon (called from `par_iter` in +/// `parse_settle_batch`). +fn parse_one_body( name: String, - spec: String, - registry_url: String, - cache: MemoryCache, + raw: Bytes, + specs: Vec, peer_deps: PeerDeps, -) -> Vec { - // Sibling fast path: full manifest already cached. 
- if let Some(full) = cache.get_full_manifest(&name) { - return settle_sibling(name, spec, full, cache, peer_deps).await; - } +) -> ParseOutcome { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - let result = fetch_full_manifest_with_settle( - FetchManifestOptions { - registry_url: ®istry_url, - name: &name, - format: MetadataFormat::Abbreviated, - etag: None, - }, - &spec, - ) - .await; - - let Ok(FetchWithSettleResult::Ok(payload)) = result else { - return Vec::new(); + let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); + let mut buf = raw.to_vec(); + let parsed = match simd_json::to_borrowed_value(&mut buf) { + Ok(v) => v, + Err(_) => { + return ParseOutcome { + name, + full: None, + settled: Vec::new(), + transitives: Vec::new(), + }; + } }; - let full_arc = Arc::new(payload.manifest); - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + let envelope_name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| name.clone()); + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); - let Some((resolved, core_arc)) = payload.primary_settle else { - return Vec::new(); + let full = FullManifest { + name: envelope_name, + dist_tags, + versions: versions_keys, + raw: Arc::clone(&raw_arc), + ..Default::default() }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + let full_arc = Arc::new(full); + + // For each requested spec, resolve + extract version subtree. + // Cache the per-(name, version) `CoreVersionManifest` so sibling + // specs that resolve to the same version reuse the same Arc. 
+ let mut version_cache: HashMap> = HashMap::new(); + let mut settled = Vec::with_capacity(specs.len()); + let mut transitives = Vec::new(); + + for spec in specs { + let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { + continue; + }; + let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { + Arc::clone(cached) + } else { + let Some(core) = parsed + .get("versions") + .and_then(|v| v.get(resolved_version.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + else { + continue; + }; + let arc = Arc::new(core); + version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); + arc + }; + transitives.extend(extract_transitive(&core_arc, peer_deps)); + settled.push((spec, resolved_version, core_arc)); + } + + ParseOutcome { + name, + full: Some(full_arc), + settled, + transitives, + } +} + +/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. +async fn parse_settle_batch( + bodies: Vec, + by_name: HashMap>, + peer_deps: PeerDeps, +) -> Vec { + let work: Vec<(String, Bytes, Vec)> = bodies + .into_iter() + .filter_map(|f| { + let bytes = f.bytes?; + let specs = by_name.get(&f.name).cloned().unwrap_or_default(); + Some((f.name, bytes, specs)) + }) + .collect(); + + if work.is_empty() { + return Vec::new(); + } + + tokio::task::spawn_blocking(move || { + work.into_par_iter() + .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) + .collect::>() + }) + .await + .unwrap_or_default() } -/// Manifest-bench-style flat parallel fetch. See module docs for the -/// rationale. +/// Two-phase mb-style fetch with rayon batch parse. See module docs. 
pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -173,70 +277,136 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending: VecDeque = initial_deps.into(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - let mut futs = FuturesUnordered::new(); - let cap = config.concurrency; + let mut pending_specs: Vec = initial_deps; + let mut done_names: HashSet = HashSet::new(); + let conc = config.concurrency; let peer_deps = config.peer_deps; - let registry_url = registry_url.to_string(); + let total_start = tokio::time::Instant::now(); - let start = tokio::time::Instant::now(); + while !pending_specs.is_empty() { + stats.iterations += 1; + let iter = stats.iterations; - // Initial fill — same shape as the refill below. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; - }; - if !seen.insert((name.clone(), spec.clone())) { - continue; + // Group this iteration's pending specs by name. + let mut by_name: HashMap> = HashMap::new(); + for (name, spec) in pending_specs.drain(..) { + by_name.entry(name).or_default().push(spec); } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); - } - while let Some(transitive) = futs.next().await { - if transitive.is_empty() { - // Empty result is ambiguous (no transitive deps OR fetch - // failed) — `MbFetchStats` only tracks success/fail at a - // coarse level. The fetch-timings counters (recorded - // inside `fetch_full_manifest_with_settle`) carry the - // detailed per-fetch metrics. - stats.fail += 1; - } else { - stats.success += 1; + // Names whose full manifest is already cached from a prior + // iteration: settle their siblings synchronously (cheap + // semver match + cache lookup; no parse if version_manifest + // already cached, otherwise quick simd_json subtree extract). 
+ let mut sibling_only: Vec<(String, Vec)> = Vec::new(); + let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); + for (name, specs) in &by_name { + if done_names.contains(name) { + sibling_only.push((name.clone(), specs.clone())); + } else { + to_fetch.push(name.clone()); + } } - pending.extend(transitive); - // Refill — same body as the initial fill above. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; + // Sibling settles (rare on real workloads — most names appear + // exactly once across the whole walk). + for (name, specs) in sibling_only { + let Some(full) = cache.get_full_manifest(&name) else { + continue; }; - if !seen.insert((name.clone(), spec.clone())) { + for spec in specs { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + continue; + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); + pending_specs.extend(extract_transitive(&cached, peer_deps)); + continue; + } + if let Some(core) = full.get_core_version(&resolved) { + let core_arc = Arc::new(core); + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); + pending_specs.extend(extract_transitive(&core_arc, peer_deps)); + } + } + } + + if to_fetch.is_empty() { + // Iteration drained pending entirely via sibling settles. + continue; + } + + // PHASE 1 — pure HTTP, mb-style. 
+ let p1_start = tokio::time::Instant::now(); + let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; + let p1_wall = p1_start.elapsed().as_millis(); + let total_bytes: usize = bodies + .iter() + .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) + .sum(); + tracing::info!( + "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", + iter, + p1_wall, + to_fetch.len(), + total_bytes, + ); + + // PHASE 2 — rayon batch parse + settle. + let p2_start = tokio::time::Instant::now(); + let by_name_for_parse = by_name + .iter() + .filter(|(name, _)| !done_names.contains(*name)) + .map(|(n, s)| (n.clone(), s.clone())) + .collect::>(); + let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; + let p2_wall = p2_start.elapsed().as_millis(); + + let mut new_transitives: Vec = Vec::new(); + let mut settle_count = 0usize; + let mut fail_count = 0usize; + for outcome in parsed { + done_names.insert(outcome.name.clone()); + let Some(full_arc) = outcome.full else { + fail_count += 1; continue; + }; + cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); + for (spec, resolved, core) in outcome.settled { + cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); + cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); + settle_count += 1; } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); + new_transitives.extend(outcome.transitives); } + // Names that fetched but failed parse — still mark done so we + // don't refetch them next iteration. 
+ for name in to_fetch { + done_names.insert(name); + } + + stats.success += settle_count; + stats.fail += fail_count; + + tracing::info!( + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + iter, + p2_wall, + settle_count, + fail_count, + new_transitives.len(), + ); + + pending_specs.extend(new_transitives); } - let wall = start.elapsed(); + let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", - wall.as_millis(), + "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + total_wall, + stats.iterations, stats.success, stats.fail, - FETCH_TIMINGS.snapshot().summary_line(), ); stats From 24165fb6d355d78cc606b69773fe2dc466560834 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:09:29 +0800 Subject: [PATCH 21/24] =?UTF-8?q?fix(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20restore=20spec-level=20dedup=20to=20terminate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v3 dropped the (name, spec) HashSet from v1/v2 thinking name-level dedup via done_names was sufficient. It wasn't: sibling-settle's extract_transitive can re-introduce specs we've already settled (peer/optional dep cycles trivially trigger this), so the outer while-loop never terminated. CI 25589397823 hung on `Run phase-isolated benchmark · npmjs` for ~25 min before being cancelled — the bench's first utoo p1_resolve hyperfine run got stuck in an infinite settle loop. Fix: maintain `seen_specs: HashSet<(String, String)>` across all iterations; filter both initial seed and every wave of new transitives through it before extending pending_specs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 42 ++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 05e1bf038..7ef0b5d85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -278,11 +278,20 @@ pub async fn mb_fetch( ) -> MbFetchStats { let mut stats = MbFetchStats::default(); let mut pending_specs: Vec = initial_deps; + // (name, spec) pairs we've already processed (settled or queued + // to settle). Without this, sibling-settle's transitive deps can + // re-introduce already-walked specs and the outer loop never + // terminates — peer / optional dep cycles trivially trigger this. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut done_names: HashSet = HashSet::new(); let conc = config.concurrency; let peer_deps = config.peer_deps; let total_start = tokio::time::Instant::now(); + // Filter the initial seed through `seen_specs` too — root + workspace + // edges can list the same dep multiple times across workspaces. + pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + while !pending_specs.is_empty() { stats.iterations += 1; let iter = stats.iterations; @@ -308,7 +317,8 @@ pub async fn mb_fetch( } // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). + // exactly once across the whole walk). New transitives go + // through `seen_specs` dedup before joining `pending_specs`. 
for (name, specs) in sibling_only { let Some(full) = cache.get_full_manifest(&name) else { continue; @@ -317,17 +327,22 @@ pub async fn mb_fetch( let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { continue; }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - pending_specs.extend(extract_transitive(&cached, peer_deps)); - continue; - } - if let Some(core) = full.get_core_version(&resolved) { + extract_transitive(&cached, peer_deps) + } else if let Some(core) = full.get_core_version(&resolved) { let core_arc = Arc::new(core); cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - pending_specs.extend(extract_transitive(&core_arc, peer_deps)); - } + extract_transitive(&core_arc, peer_deps) + } else { + Vec::new() + }; + pending_specs.extend( + new_deps + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), + ); } } @@ -388,16 +403,21 @@ pub async fn mb_fetch( stats.success += settle_count; stats.fail += fail_count; + let new_unique: Vec = new_transitives + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) + .collect(); + tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", iter, p2_wall, settle_count, fail_count, - new_transitives.len(), + new_unique.len(), ); - pending_specs.extend(new_transitives); + pending_specs.extend(new_unique); } let total_wall = total_start.elapsed().as_millis(); From 41822b081c713758fdbd633513d7257258f39d45 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:51:06 +0800 Subject: [PATCH 22/24] 
=?UTF-8?q?perf(pm):=20preload-bench=20=E2=80=94=20s?= =?UTF-8?q?elf-contained=20streaming=20preload=20baseline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New crate `crates/preload-bench/` is a fully-standalone bench that: * Uses the SAME HTTP setup as `manifest-bench` (own reqwest::Client built per rep with aws-lc-rs TLS, pool_max_idle_per_host(256), no proxy, default DNS, no retry, h1_only). * Discovers names by walking transitive deps from a package.json root — instead of consuming a flat name list like manifest-bench. * Per-future does GET + body recv + spawn_blocking parse → returns transitive deps → main loop refills on completion. * No dependency on ruborist or any utoo internals (own simd_json, own dedup, own everything). The point: prove (or disprove) that a fully ruborist-independent streaming preload can hit standalone manifest-bench's wall on the same workload. ruborist's path runs at ~2.18s for ant-design's ~2700 names; manifest-bench standalone runs the same workload at ~1.6s. The gap could be in any number of things — DNS layer, retry, pool config, parse-CPU contention, registry single-flight gates. preload-bench eliminates all of those simultaneously so we can read the wall directly. Wired into bench-phases-linux: builds + uploads preload-bench binary alongside manifest-bench, then runs a conc=64/96/128 sweep against the same project after the standalone manifest-bench sweep. bench script reverts UTOO_RESOLVE=mb so utoo runs default fast_preload — gives a third datapoint (utoo wall on integrated path) alongside manifest-bench (HTTP-only ceiling) and preload-bench (streaming-with-walk ceiling). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 69 ++++ Cargo.toml | 1 + bench/pm-bench-phases.sh | 12 +- crates/preload-bench/Cargo.toml | 38 +++ crates/preload-bench/src/main.rs | 505 +++++++++++++++++++++++++++++ 5 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 crates/preload-bench/Cargo.toml create mode 100644 crates/preload-bench/src/main.rs diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index b25f5c380..eb560969b 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -161,6 +161,25 @@ jobs: name: manifest-bench-linux-x64 path: target/x86_64-unknown-linux-gnu/release/manifest-bench retention-days: 1 + # preload-bench: same HTTP setup as manifest-bench, but discovers + # names by walking transitive deps from a package.json root — + # tests whether a fully self-contained streaming preload can match + # standalone manifest-bench's wall on the same workload that + # ruborist's path runs at ~2.18s. 
+ - name: Build preload-bench + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p preload-bench + - name: Upload preload-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: preload-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/preload-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -547,6 +566,20 @@ jobs: chmod +x /tmp/manifest-bench-dist/manifest-bench mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + # Self-contained streaming preload bench — same HTTP setup as + # manifest-bench but discovers names via transitive walk from a + # package.json. Used to test whether a fully-isolated path can + # match standalone manifest-bench's wall on the same workload. 
+ - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -645,6 +678,42 @@ jobs: "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true } 2>&1 | tee "$MB_LOG" + # Self-contained streaming preload (transitive walk from + # package.json) — same HTTP setup as manifest-bench but with a + # streaming FuturesUnordered + per-future parse. This tests + # whether a fully ruborist-independent path can hit standalone + # manifest-bench's wall under the same project workload. + - name: Standalone preload-bench (transitive walk sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + echo "no project dir; skipping preload-bench"; exit 0 + fi + PJ="$PROJECT_DIR/package.json" + if [ ! -f "$PJ" ]; then + echo "no package.json; skipping preload-bench"; exit 0 + fi + + PB_LOG=/tmp/pm-bench-output/preload-bench-npmjs.log + { + echo "============================================================" + echo "preload-bench: streaming transitive-walk preload" + echo " Self-contained (no ruborist deps). Same HTTP setup as" + echo " manifest-bench, but discovers names by walking transitive" + echo " deps from package.json instead of consuming a flat list." 
+ echo "============================================================" + for CAP in 64 96 128; do + echo + echo "--- concurrency=$CAP, h1, transitive walk ---" + "$PRELOAD_BENCH_BIN" --package-json "$PJ" --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 4 || true + done + } 2>&1 | tee "$PB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 diff --git a/Cargo.toml b/Cargo.toml index 0574a185a..4b2836c06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "crates/manifest-bench", + "crates/preload-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 26e43388c..b025ebc6f 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,12 +22,12 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" -# Route the current `utoo` binary's resolve phase through the -# experimental `mb_resolve` flat-fetch path. Other PMs ignore this -# env var (utoo-next is built from origin/next which doesn't have -# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out -# to A/B against the default `fast_preload` path. -export UTOO_RESOLVE=mb +# utoo path defaults to fast_preload (combined-parse) so we have a +# stable baseline to compare against. preload-bench is run as a +# separate standalone tool by the CI workflow — its wall is the +# self-contained-streaming reference, ruborist's utoo p1_resolve +# wall is the integrated path. The gap between them is what +# remains to close. 
# Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN diff --git a/crates/preload-bench/Cargo.toml b/crates/preload-bench/Cargo.toml new file mode 100644 index 000000000..9d37d7769 --- /dev/null +++ b/crates/preload-bench/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "preload-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Self-contained streaming-with-transitive-walk manifest preload bench. Reproduces manifest-bench's standalone fetch loop but discovers transitive deps from package.json instead of consuming a flat name list. No dependency on ruborist or any utoo internals." + +[[bin]] +name = "preload-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +simd-json = "0.17" +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Same TLS/DNS choices as manifest-bench so the only delta vs that bench +# is the transitive-walk loop. +reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/preload-bench/src/main.rs b/crates/preload-bench/src/main.rs new file mode 100644 index 000000000..46f917d19 --- /dev/null +++ b/crates/preload-bench/src/main.rs @@ -0,0 +1,505 @@ +//! Self-contained streaming preload bench with transitive walking. +//! +//! Same HTTP setup as `manifest-bench` (own `reqwest::Client` built +//! per rep with `aws-lc-rs` TLS, `pool_max_idle_per_host(256)`, no +//! proxy, default DNS, no retry). 
The only delta vs `manifest-bench` +//! is that this bench discovers names by walking transitive deps +//! from a `package.json` root, instead of consuming a flat name +//! list. +//! +//! Why a separate crate: ruborist's manifest-fetch path goes through +//! several service layers (custom DNS resolver, retry, cache, +//! single-flight gates, event receivers). Each layer might add +//! overhead. This bench bypasses all of them — same shape as +//! manifest-bench, just with a streaming `FuturesUnordered` that +//! refills from a pending queue extended by parsed transitive deps. +//! +//! Reports both the standalone preload wall and a per-rep eff_parallel +//! number so we can compare directly against manifest-bench's +//! `phase_wall` + `avg_conc` for the same workload. +//! +//! Output (one line per rep, matching manifest-bench shape): +//! [rep N] preload_wall=Xms n=Y bytes=Z avg_conc=N.N parse_sum=Wms 200=A 4xx=B err=C + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; +use serde::Deserialize; + +#[derive(Parser, Debug)] +#[command( + name = "preload-bench", + about = "Streaming preload bench with transitive walking (self-contained)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// Path to a `package.json` to walk from. Reads `dependencies` + + /// `devDependencies` + `optionalDependencies` as the initial seed. + #[arg(long)] + package_json: PathBuf, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 96)] + concurrency: usize, + + /// Number of times to repeat the whole walk (fresh client per rep). + #[arg(long, default_value_t = 4)] + reps: usize, + + /// Force HTTP/1.1. + #[arg(long, default_value_t = true)] + http1_only: bool, + + /// Override `User-Agent`. 
+ #[arg(long)] + user_agent: Option, + + /// Include `peerDependencies` when walking transitives. Off by + /// default (matches utoo's default). + #[arg(long)] + include_peer: bool, +} + +#[derive(Deserialize)] +struct PackageJson { + #[serde(default)] + dependencies: HashMap, + #[serde(default, rename = "devDependencies")] + dev_dependencies: HashMap, + #[serde(default, rename = "optionalDependencies")] + optional_dependencies: HashMap, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let raw = std::fs::read_to_string(&args.package_json) + .with_context(|| format!("read {:?}", args.package_json))?; + let pkg: PackageJson = serde_json::from_str(&raw).context("parse package.json")?; + let initial: Vec<(String, String)> = pkg + .dependencies + .into_iter() + .chain(pkg.dev_dependencies) + .chain(pkg.optional_dependencies) + .filter(|(_, spec)| is_registry_spec(spec)) + .collect(); + + println!( + "preload-bench: registry={} concurrency={} reps={} initial={} h1_only={} ua={} include_peer={}", + args.registry, + args.concurrency, + args.reps, + initial.len(), + args.http1_only, + args.user_agent.as_deref().unwrap_or(""), + args.include_peer, + ); + + for rep in 1..=args.reps { + run_once(&args, &initial, rep).await?; + } + + Ok(()) +} + +/// Quick registry-spec check (a `^...` / `~...` / `latest` / etc). +/// Excludes `file:`, `link:`, `workspace:`, `git+`, `https://`, and +/// `/` shorthand. Same intent as ruborist's +/// `SpecStr::is_registry_spec` but inlined to keep this crate +/// dependency-free. 
+fn is_registry_spec(spec: &str) -> bool { + if spec.is_empty() { + return true; // bare entries default to "*" + } + let lower = spec.to_ascii_lowercase(); + if lower.starts_with("file:") + || lower.starts_with("link:") + || lower.starts_with("workspace:") + || lower.starts_with("portal:") + || lower.starts_with("git+") + || lower.starts_with("git://") + || lower.starts_with("github:") + || lower.starts_with("https://") + || lower.starts_with("http://") + { + return false; + } + // `/` shorthand — exactly one '/' and no '@' prefix on + // first segment (rules out scoped names like `@scope/pkg`). + if let Some((head, tail)) = spec.split_once('/') + && !head.starts_with('@') + && !tail.is_empty() + && !tail.contains('/') + { + return false; + } + true +} + +#[derive(Debug, Default)] +struct RepStats { + n: usize, + bytes: usize, + parse_sum_us: u128, + busy_us: u128, + sum_us: u128, + ok_200: usize, + err_4xx: usize, + err_other: usize, +} + +async fn run_once(args: &Args, initial: &[(String, String)], rep: usize) -> Result<()> { + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let concurrency = args.concurrency; + let include_peer = args.include_peer; + + let phase_start = Instant::now(); + let mut stats = RepStats::default(); + + // (name, spec) dedup — same shape as ruborist's seen_specs but + // self-contained. We dedup the *spec* level because two specs on + // the same name might resolve to different versions. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque<(String, String)> = VecDeque::new(); + for (name, spec) in initial { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name.clone(), spec.clone())); + } + } + + // Sibling-fetch dedup: when two specs for the same name are both + // pending, only one fetch is issued; subsequent specs settle from + // the cached body. Keyed by name. 
Maps name → cached parsed body + // (`Arc>`) once the first fetch lands. + let body_cache: Arc>>>> = + Arc::new(std::sync::Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + + // If the body is already cached (sibling spec for an + // already-fetched name), spawn a settle-only future. + if let Some(raw) = body_cache.lock().unwrap().get(&name).cloned() { + let n = name.clone(); + let s = spec.clone(); + let fut: Fut = Box::pin(settle_only(n, s, raw, include_peer)); + futs.push(fut); + continue; + } + + // First time seeing this name: fetch + settle. Stash any + // sibling specs that arrive while in-flight. + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + + spawn_fetch( + &client, + ®istry, + name, + spec, + Arc::clone(&body_cache), + include_peer, + &mut futs, + ); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + stats.n += 1; + stats.busy_us += out.busy_us; + stats.sum_us += out.sum_us; + stats.parse_sum_us += out.parse_us; + stats.bytes += out.bytes; + match out.status { + 200 => stats.ok_200 += 1, + 400..=499 => stats.err_4xx += 1, + _ => stats.err_other += 1, + } + + // Drain sibling specs for this name now that body is cached. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().unwrap().get(&out.name).cloned() + { + for sibling_spec in siblings { + let n = out.name.clone(); + let r = Arc::clone(&raw); + let fut: Fut = Box::pin(settle_only(n, sibling_spec, r, include_peer)); + futs.push(fut); + } + } + + // Extend pending with new transitives, dedup by (name, spec). 
+ for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } + } + + let phase_wall_ms = phase_start.elapsed().as_millis(); + let parse_sum_ms = stats.parse_sum_us / 1000; + // avg_conc = sum_request_us / busy_window_us. busy_us isn't a true + // merged-interval here (we don't track per-req start/end timestamps + // for that), so use phase_wall as the denominator — slightly + // pessimistic but consistent. + let avg_conc = if phase_wall_ms > 0 { + stats.sum_us as f64 / 1000.0 / phase_wall_ms as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] preload_wall={phase_wall_ms}ms n={} bytes={} parse_sum={parse_sum_ms}ms avg_conc={avg_conc:.1} 200={} 4xx={} err={}", + stats.n, stats.bytes, stats.ok_200, stats.err_4xx, stats.err_other, + ); + Ok(()) +} + +#[derive(Debug)] +struct FetchOutcome { + name: String, + /// `(name, spec)` transitive deps unfolded by parsing the resolved + /// version's `dependencies` / `optionalDependencies` (and + /// optionally `peerDependencies`). + transitives: Vec<(String, String)>, + /// `true` if this future fetched the body (vs settle-only on a + /// cached body); only fetchers populate `body_cache` and trigger + /// sibling drain. + fetched: bool, + /// HTTP status code (200 / 4xx / 5xx / 0 on transport error). + status: u16, + /// Body byte count (0 on error). + bytes: usize, + /// Self-reported per-future busy_us — `end - start`. Approximate. + busy_us: u128, + /// Sum of all per-future durations summed by the main loop. + sum_us: u128, + /// Parse work done inside this future (for accounting). 
+ parse_us: u128, +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_fetch( + client: &reqwest::Client, + registry: &Arc, + name: String, + spec: String, + body_cache: Arc>>>>, + include_peer: bool, + futs: &mut FuturesUnordered, +) { + let url = format!("{}/{}", registry, name); + let client = client.clone(); + let fut: Fut = Box::pin(async move { + let start = Instant::now(); + let req = client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send(); + let (raw_bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default(); + (body, status) + } + Err(_) => (Vec::new(), 0), + }; + let bytes = raw_bytes.len(); + + let (parse_us, transitives) = if status == 200 && !raw_bytes.is_empty() { + let raw_arc = Arc::new(raw_bytes); + body_cache + .lock() + .unwrap() + .insert(name.clone(), Arc::clone(&raw_arc)); + // Move the Arc> into spawn_blocking; the parser + // mutates a clone, so the cached copy is unaffected. 
+ let spec_for_parse = spec.clone(); + let parse_start = Instant::now(); + let result = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw_arc, &spec_for_parse, include_peer) + }) + .await + .ok() + .flatten() + .unwrap_or_default(); + (parse_start.elapsed().as_micros(), result) + } else { + (0, Vec::new()) + }; + + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: true, + status, + bytes, + busy_us, + sum_us: busy_us, + parse_us, + } + }); + futs.push(fut); +} + +async fn settle_only( + name: String, + spec: String, + raw: Arc>, + include_peer: bool, +) -> FetchOutcome { + let start = Instant::now(); + let parse_start = start; + let transitives = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw, &spec, include_peer).unwrap_or_default() + }) + .await + .unwrap_or_default(); + let parse_us = parse_start.elapsed().as_micros(); + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: false, + status: 200, + bytes: 0, + busy_us, + sum_us: busy_us, + parse_us, + } +} + +/// Parse a manifest body, resolve `spec` against the version list, +/// extract that version's transitive deps. Single +/// `simd_json::to_borrowed_value` pass for the whole body — same as +/// ruborist's combined-parse path, but inlined here so this crate +/// has no ruborist dependency. +fn parse_and_extract( + raw: &Arc>, + spec: &str, + include_peer: bool, +) -> Option> { + use simd_json::prelude::{ValueAsObject, ValueObjectAccess}; + + let mut buf = (**raw).clone(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_obj = parsed.get("versions").and_then(ValueAsObject::as_object)?; + + // Resolve spec. 
Three cases: dist-tag match, exact-version key, or + // semver range (we approximate with "first version that satisfies" + // — preload-bench is a measurement tool, not a real resolver, so + // we tolerate slight selection differences vs ruborist for the + // purpose of timing the network path). + let resolved = if let Some(via_tag) = dist_tags.get(spec) { + via_tag.clone() + } else if versions_obj.contains_key(spec) { + spec.to_string() + } else if let Some(latest) = dist_tags.get("latest") + && spec_satisfied_by(spec, latest) + { + latest.clone() + } else { + // Last-resort: pick the lexicographically-largest version. Not + // semver-correct but bounded by the version set, and good + // enough for timing. + versions_obj.keys().max().map(|k| k.to_string())? + }; + + let version_obj = versions_obj.get(resolved.as_str())?; + let mut out: Vec<(String, String)> = Vec::new(); + + if let Some(deps) = version_obj.get("dependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if include_peer + && let Some(deps) = version_obj.get("peerDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if let Some(deps) = version_obj.get("optionalDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + Some(out) +} + +/// Crude semver-satisfies check: only handles `^X.Y.Z` and `~X.Y.Z` +/// against an exact target. Sufficient for "does latest satisfy spec" +/// in this measurement context — full semver is in the resolver, not +/// the bench. 
+fn spec_satisfied_by(spec: &str, target: &str) -> bool { + let s = spec.trim(); + let body = s + .strip_prefix('^') + .or_else(|| s.strip_prefix('~')) + .unwrap_or(s); + target.starts_with(body) || target == body +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Same setup as manifest-bench. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} From 01d15130d01cb6768d2fe5b4d4c577a7b4139a03 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:55:56 +0800 Subject: [PATCH 23/24] perf(pm): integrate standalone preload into ruborist for lockfile-only path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 1 of staged service-layer ablation. Rewrites mb_resolve as a fully self-contained streaming preload mirroring preload-bench's loop shape verbatim, but living inside ruborist so it can populate MemoryCache for the BFS phase. 
Bypasses every other ruborist service layer: * service::http::get_client — own reqwest::Client built per call, no global LazyLock, no shared_resolver dns layer, no connect_timeout, pool_max_idle_per_host(256). * service::manifest::fetch_full_manifest_with_settle — own GET + body.bytes() + spawn_blocking(simd_json::to_borrowed_value), no RetryIf, no FETCH_TIMINGS. * service::registry::UnifiedRegistry — no OnceMap, no ManifestStore, no EventReceiver. Only service::* touched is MemoryCache writes (DashMap inserts) so BFS has data to read from. PM is unaware: dispatch happens entirely inside service::api::build_deps when skip_preload=true and no warm cache. Removes the previous UTOO_RESOLVE=mb env-var gating from pm::helper::ruborist_context::Context::build_deps and pipeline::resolve_with_pipeline. Removes the now-unused service::api::build_deps_mb sibling entry point. Expected: utoo p1_resolve drops from ~2.67s toward preload-bench's ~2.57s (or better since ruborist fetches fewer names than preload-bench). The remaining gap to mb's ~1.99s would isolate incremental layer effects we add back next: - tokio runtime config / cooperative scheduling - reqwest::Client provider differences (TLS, DNS) - cache layer (DashMap vs DiskManifestStore reads on the cold path) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 597 ++++++++++----------- crates/ruborist/src/service/api.rs | 175 +----- crates/ruborist/src/service/mod.rs | 2 +- 5 files changed, 289 insertions(+), 524 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index 542664f8c..c8b758a6f 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -84,23 +84,17 @@ impl Context { /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. 
/// - /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes - /// `PackageResolved` events here, so preload is pure overhead — BFS's - /// own per-level parallel prefetch warms the manifest cache. - /// - /// Set `UTOO_RESOLVE=mb` to opt into the experimental - /// manifest-bench-style fetch path (`build_deps_mb`) for A/B - /// benchmarking against the current `fast_preload`. + /// Used by the lockfile-only path (`utoo deps`). With + /// `skip_preload=true`, ruborist's `service::api::build_deps` + /// internally routes through `mb_resolve::mb_fetch` — a + /// standalone manifest-bench-style preload that bypasses + /// `service::http` / `service::manifest` / `service::registry` + /// for the cold-cache lockfile-only workload. PM doesn't see + /// the dispatch. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 4169ca88d..719d31d13 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,22 +41,7 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - // `UTOO_RESOLVE=mb` reroutes install through the experimental - // mb-style fetch path. 
Pipeline workers are still started, but - // because mb_fetch doesn't emit `PackageResolved` events, the - // pipeline only fires once BFS completes (graph_to_package_lock - // emits `PackagePlaced` from BFS). Install becomes - // phase-sequential — fetch all manifests, then download + - // clone. Useful for A/B benchmarking the resolve phase in - // isolation; the pipelining advantage of the default path is - // lost. - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7ef0b5d85..7e1376330 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,47 +1,42 @@ -//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors -//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + -//! settle. +//! Standalone manifest preload for the lockfile-only path. //! -//! ## Phase split +//! Mirrors `crates/preload-bench`'s loop shape verbatim, but lives +//! inside ruborist so it can populate `MemoryCache` for the BFS phase +//! to read. Used by `service::api::build_deps` whenever the caller +//! has `skip_preload=true` and no warm project cache — i.e. the +//! `utoo deps` (lockfile-only) path. //! -//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` -//! ran in `spawn_blocking` threads that competed with tokio runtime -//! workers for CPU on the 2-core GHA box. When 50+ parses ran in -//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` -//! 
capped at ~47 against the 96 cap (vs `manifest-bench` standalone's -//! 75 on the same box). +//! Bypasses every other ruborist service layer: +//! * `service::http::get_client` — own `reqwest::Client` built per +//! call, no global LazyLock, no `dns_resolver(shared_resolver)`, +//! no `connect_timeout`, `pool_max_idle_per_host(256)` matching +//! `preload-bench` / `manifest-bench`. +//! * `service::manifest::fetch_full_manifest_with_settle` — own +//! `reqwest::get + body.bytes() + spawn_blocking(simd_json +//! to_borrowed_value)`, no `RetryIf`, no `FETCH_TIMINGS`. +//! * `service::registry::UnifiedRegistry` — no `OnceMap` inflight +//! gates, no `ManifestStore`, no `EventReceiver`. //! -//! v3 separates the work: +//! The only `service::*` touched is `MemoryCache::set_full_manifest` +//! and `MemoryCache::set_version_manifest` — thin DashMap wrappers +//! the BFS phase reads from. Without that, BFS would have nothing to +//! resolve against. //! -//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of -//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, -//! nothing else) + 1-for-1 refill on completion. The future body -//! has zero CPU work, so the tokio runtime workers retain full CPU -//! to drive sockets and `eff_parallel` reaches the same level as -//! the standalone bench. -//! -//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For -//! each fetched body: parse `FullManifest` envelope, resolve every -//! spec we need for this name, materialize `CoreVersionManifest` -//! subtrees, populate cache slots, collect transitive deps for the -//! next iteration. -//! -//! Phases alternate until `pending` is empty (typical project: 3-5 -//! iterations as transitive deps fan out wave by wave). -//! -//! Phase 1 is the line we measure against `manifest-bench` — -//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us -//! check eff_parallel directly. -//! -//! Wired in via `UTOO_RESOLVE=mb` env var (see -//! 
`pm::helper::ruborist_context::Context::build_deps`). - -use std::collections::{HashMap, HashSet}; +//! Why a separate path: same-run CI data shows `preload-bench` +//! (self-contained, transitive walk, 4153 fetches) lands at ~2.57s +//! while ruborist's existing `fast_preload` path (combined parse via +//! service layers, 2733 fetches) lands at ~2.67s on the same network +//! — so on a per-fetch basis the service-layer path is ~50 % slower. +//! Removing the layers should close that gap. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use std::time::Instant; -use bytes::Bytes; +use anyhow::{Context, Result}; use futures::stream::{FuturesUnordered, StreamExt}; -use rayon::prelude::*; +use parking_lot::Mutex; use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -49,38 +44,29 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; -use crate::service::http::get_client; use crate::spec::SpecStr; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, - pub iterations: usize, -} - -/// Phase 1 result: one body per fetched name. `bytes` is `None` on -/// transport / non-2xx — kept in the result vector so phase 2 can -/// account for it, but contributes no settle work. -struct FetchOutcome { - name: String, - bytes: Option, } -/// Phase 2 per-name output. `full` is `None` on parse failure. -struct ParseOutcome { - name: String, - full: Option>, - /// Per-spec settled subtrees: `(spec, resolved_version, core)`. - /// Empty when the body failed to fetch / parse, or when no spec - /// resolves against the manifest. - settled: Vec<(String, String, Arc)>, - /// Transitive deps collected across all settled subtrees for this - /// name. Already filtered to registry specs; the main loop dedups - /// against `done_names` before queueing. 
- transitives: Vec, +/// Build a fresh `reqwest::Client` matching `preload-bench` / +/// `manifest-bench` exactly, except for the TLS provider — those +/// benches use aws-lc-rs but we keep ruborist's existing default +/// rustls (ring on Linux). If A/B data shows TLS is the remaining +/// gap, we'll add the aws-lc-rs deps separately. +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .pool_max_idle_per_host(256) + .http1_only() + .build() + .context("build reqwest client for mb_resolve") } +/// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() @@ -99,177 +85,183 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future -/// body does ONLY GET + body recv; no parse, no cache writes, no -/// dedup. Returns one `FetchOutcome` per input name in arrival order. -async fn mb_style_pure_fetch( - names: Vec, - registry_url: &str, - concurrency: usize, -) -> Vec { - let client = match get_client() { - Ok(c) => c.clone(), - Err(e) => { - tracing::warn!("get_client failed: {e}"); - return Vec::new(); - } - }; - - let mut results: Vec = Vec::with_capacity(names.len()); - let mut futs = FuturesUnordered::new(); - let mut idx = 0usize; - - let spawn_one = |client: &reqwest::Client, - registry_url: &str, - name: String, - futs: &mut FuturesUnordered<_>| { - let url = format!("{}/{}", registry_url, name); - let client = client.clone(); - futs.push(Box::pin(async move { - let bytes = match client - .get(&url) - .header("accept", "application/vnd.npm.install-v1+json") - .send() - .await - { - Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), - _ => None, - }; - FetchOutcome { name, bytes } - })); - }; +/// What a future returns when it lands. 
The main loop uses +/// `transitives` to extend `pending`, plus the cache writes already +/// happened inside the future. Only `fetched=true` futures populate +/// `body_cache` and trigger sibling drain. +struct FetchOutcome { + name: String, + transitives: Vec, + fetched: bool, +} - while idx < names.len() && futs.len() < concurrency { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } +type Fut = Pin + Send>>; - while let Some(outcome) = futs.next().await { - results.push(outcome); - if idx < names.len() { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } - } +/// `(name, spec) → (FullManifest, resolved_version, version_subtree, transitive_deps)`. +type ParseResult = ( + Arc, + String, + Arc, + Vec, +); - results -} +/// Single combined parse: one `simd_json::to_borrowed_value` over the +/// raw body extracts the envelope (name, dist-tags, versions keys) +/// AND deserializes the resolved version's `CoreVersionManifest` +/// subtree. Same shape as the parse step in `preload-bench`. +fn parse_combined(raw: Arc<[u8]>, spec: &str, peer_deps: PeerDeps) -> Option { + use simd_json::prelude::{ValueAsObject, ValueAsScalar, ValueObjectAccess}; -/// Sync phase 2 worker: parse one body, settle all specs we need for -/// this name. Runs on rayon (called from `par_iter` in -/// `parse_settle_batch`). 
-fn parse_one_body( - name: String, - raw: Bytes, - specs: Vec, - peer_deps: PeerDeps, -) -> ParseOutcome { - use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - - let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); - let mut buf = raw.to_vec(); - let parsed = match simd_json::to_borrowed_value(&mut buf) { - Ok(v) => v, - Err(_) => { - return ParseOutcome { - name, - full: None, - settled: Vec::new(), - transitives: Vec::new(), - }; - } - }; + let mut buf = (*raw).to_vec(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; - let envelope_name = parsed + let name = parsed .get("name") .and_then(|v| v.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| name.clone()); + .unwrap_or_default(); let dist_tags: HashMap = parsed .get("dist-tags") .and_then(|v| HashMap::::deserialize(v).ok()) .unwrap_or_default(); let versions_keys: Vec = parsed .get("versions") - .and_then(simd_json::prelude::ValueAsObject::as_object) + .and_then(ValueAsObject::as_object) .map(|obj| obj.keys().map(|k| k.to_string()).collect()) .unwrap_or_default(); let full = FullManifest { - name: envelope_name, + name, dist_tags, versions: versions_keys, - raw: Arc::clone(&raw_arc), + raw: Arc::clone(&raw), ..Default::default() }; - let full_arc = Arc::new(full); - - // For each requested spec, resolve + extract version subtree. - // Cache the per-(name, version) `CoreVersionManifest` so sibling - // specs that resolve to the same version reuse the same Arc. 
- let mut version_cache: HashMap> = HashMap::new(); - let mut settled = Vec::with_capacity(specs.len()); - let mut transitives = Vec::new(); - - for spec in specs { - let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { - continue; + + let resolved = resolve_target_version((&full).into(), spec).ok()?; + let core = parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())?; + let core_arc = Arc::new(core); + let transitives = extract_transitive(&core_arc, peer_deps); + + Some((Arc::new(full), resolved, core_arc, transitives)) +} + +/// Fetch + combined parse + cache write for one `(name, spec)`. +/// Future body owns all per-fetch work; main loop only extends +/// `pending` from the returned transitives and refills `futs`. +fn spawn_fetch( + client: reqwest::Client, + registry_url: Arc, + name: String, + spec: String, + cache: MemoryCache, + body_cache: Arc>>>, + peer_deps: PeerDeps, +) -> Fut { + Box::pin(async move { + let url = format!("{}/{}", registry_url, name); + let resp = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(r) if r.status().is_success() => r, + _ => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } }; - let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { - Arc::clone(cached) - } else { - let Some(core) = parsed - .get("versions") - .and_then(|v| v.get(resolved_version.as_str())) - .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) - else { - continue; - }; - let arc = Arc::new(core); - version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); - arc + let raw_bytes = match resp.bytes().await { + Ok(b) => b, + Err(_) => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } + }; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); + // 
Stash in body_cache early so concurrent sibling specs + // arriving slightly after see it on their pending pop. + body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); + + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = + tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), }; - transitives.extend(extract_transitive(&core_arc, peer_deps)); - settled.push((spec, resolved_version, core_arc)); - } - ParseOutcome { - name, - full: Some(full_arc), - settled, - transitives, - } + FetchOutcome { + name, + transitives, + fetched: true, + } + }) } -/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. -async fn parse_settle_batch( - bodies: Vec, - by_name: HashMap>, +/// Settle-only future for a sibling spec whose `(name)` body already +/// landed via a sibling fetch. Same combined parse, no network. 
+fn spawn_settle( + name: String, + spec: String, + raw: Arc<[u8]>, + cache: MemoryCache, peer_deps: PeerDeps, -) -> Vec { - let work: Vec<(String, Bytes, Vec)> = bodies - .into_iter() - .filter_map(|f| { - let bytes = f.bytes?; - let specs = by_name.get(&f.name).cloned().unwrap_or_default(); - Some((f.name, bytes, specs)) +) -> Fut { + Box::pin(async move { + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(Arc::clone(&raw), &spec_for_parse, peer) }) - .collect(); - - if work.is_empty() { - return Vec::new(); - } + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + // Don't overwrite full_manifest — the original fetcher + // already set it. Only populate the version-manifest + // slots so BFS hits the (name, spec) early-return. + cache.set_full_manifest(name.clone(), full_arc); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), + }; - tokio::task::spawn_blocking(move || { - work.into_par_iter() - .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) - .collect::>() + FetchOutcome { + name, + transitives, + fetched: false, + } }) - .await - .unwrap_or_default() } -/// Two-phase mb-style fetch with rayon batch parse. See module docs. +/// Streaming preload with transitive walk. Self-contained — no +/// dependency on `service::http` / `service::manifest` / +/// `service::registry` beyond `MemoryCache` writes. pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -277,154 +269,109 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending_specs: Vec = initial_deps; - // (name, spec) pairs we've already processed (settled or queued - // to settle). 
Without this, sibling-settle's transitive deps can - // re-introduce already-walked specs and the outer loop never - // terminates — peer / optional dep cycles trivially trigger this. - let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - let mut done_names: HashSet = HashSet::new(); - let conc = config.concurrency; + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return stats; + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = config.concurrency; let peer_deps = config.peer_deps; - let total_start = tokio::time::Instant::now(); - // Filter the initial seed through `seen_specs` too — root + workspace - // edges can list the same dep multiple times across workspaces. - pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + // Spec-level dedup across the entire run. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + for (name, spec) in initial_deps { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } - while !pending_specs.is_empty() { - stats.iterations += 1; - let iter = stats.iterations; + // Sibling-fetch dedup: when two specs for the same name are both + // in flight, only the first fires a fetch; the second arrives at + // the cached body and goes through `spawn_settle` instead. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); - // Group this iteration's pending specs by name. - let mut by_name: HashMap> = HashMap::new(); - for (name, spec) in pending_specs.drain(..) 
{ - by_name.entry(name).or_default().push(spec); - } + let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Names whose full manifest is already cached from a prior - // iteration: settle their siblings synchronously (cheap - // semver match + cache lookup; no parse if version_manifest - // already cached, otherwise quick simd_json subtree extract). - let mut sibling_only: Vec<(String, Vec)> = Vec::new(); - let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); - for (name, specs) in &by_name { - if done_names.contains(name) { - sibling_only.push((name.clone(), specs.clone())); - } else { - to_fetch.push(name.clone()); + loop { + // Refill to cap. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + // Sibling fast path: body already cached. + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; } - } - - // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). New transitives go - // through `seen_specs` dedup before joining `pending_specs`. - for (name, specs) in sibling_only { - let Some(full) = cache.get_full_manifest(&name) else { + // Defer if a fetch for this name is already in flight. 
+ if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); continue; - }; - for spec in specs { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - continue; - }; - let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - extract_transitive(&cached, peer_deps) - } else if let Some(core) = full.get_core_version(&resolved) { - let core_arc = Arc::new(core); - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) - } else { - Vec::new() - }; - pending_specs.extend( - new_deps - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), - ); } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); } - if to_fetch.is_empty() { - // Iteration drained pending entirely via sibling settles. - continue; + if futs.is_empty() { + break; } - // PHASE 1 — pure HTTP, mb-style. - let p1_start = tokio::time::Instant::now(); - let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; - let p1_wall = p1_start.elapsed().as_millis(); - let total_bytes: usize = bodies - .iter() - .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) - .sum(); - tracing::info!( - "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", - iter, - p1_wall, - to_fetch.len(), - total_bytes, - ); - - // PHASE 2 — rayon batch parse + settle. 
- let p2_start = tokio::time::Instant::now(); - let by_name_for_parse = by_name - .iter() - .filter(|(name, _)| !done_names.contains(*name)) - .map(|(n, s)| (n.clone(), s.clone())) - .collect::>(); - let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; - let p2_wall = p2_start.elapsed().as_millis(); - - let mut new_transitives: Vec = Vec::new(); - let mut settle_count = 0usize; - let mut fail_count = 0usize; - for outcome in parsed { - done_names.insert(outcome.name.clone()); - let Some(full_arc) = outcome.full else { - fail_count += 1; - continue; - }; - cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); - for (spec, resolved, core) in outcome.settled { - cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); - cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); - settle_count += 1; - } - new_transitives.extend(outcome.transitives); - } - // Names that fetched but failed parse — still mark done so we - // don't refetch them next iteration. - for name in to_fetch { - done_names.insert(name); + let Some(out) = futs.next().await else { break }; + + if out.transitives.is_empty() && out.fetched { + // Empty result from a fetch is ambiguous (no transitives + // OR a fetch/parse failure). Track conservatively as + // success — the FETCH_TIMINGS-equivalent counter is + // omitted in this path on purpose to keep the future + // body lean. + stats.success += 1; + } else if out.fetched { + stats.success += 1; } - stats.success += settle_count; - stats.fail += fail_count; - - let new_unique: Vec = new_transitives - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) - .collect(); - - tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", - iter, - p2_wall, - settle_count, - fail_count, - new_unique.len(), - ); + // Drain sibling specs deferred while the fetch was in flight. 
+ if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } - pending_specs.extend(new_unique); + // Extend pending with new transitive specs, dedup. + for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } } let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + "p1-breakdown mb_fetch wall={}ms ok={} fail={}", total_wall, - stats.iterations, stats.success, stats.fail, ); diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 9687fc875..06079b248 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -38,7 +38,6 @@ use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, gather_preload_deps, }; -use crate::resolver::fast_preload::fast_preload; use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; @@ -275,19 +274,19 @@ where // Lockfile-only callers (`utoo deps`) skip the receiver-driven // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat - // `FuturesUnordered` over `fetch_full_manifest` that warms the - // `MemoryCache` so the BFS phase below is pure cache-hit. This is - // the manifest-bench-style path; the heavier `preload_manifests` - // path (with `OnceMap` gates + `EventReceiver` events) only runs - // for install paths that need the pipeline signal. + // `BuildEvent::PackageResolved`. 
Route through `mb_fetch` — a + // ruborist-internal standalone preload that bypasses + // `service::http`, `service::manifest`, and `service::registry` + // to match `manifest-bench`'s loop shape directly. PM is + // unaware: this dispatch happens entirely inside ruborist when + // `skip_preload=true` and there's no warm project cache. if skip_preload_caller && cache_count == 0 { let initial_deps = gather_preload_deps(&graph, peer_deps); let preload_config = PreloadConfig { peer_deps, concurrency, }; - fast_preload( + mb_fetch( initial_deps, registry.registry_url(), registry.cache(), @@ -333,166 +332,6 @@ where }) } -/// Experimental parallel-track entry point: structurally identical to -/// [`build_deps`] but routes the manifest-fetch phase through -/// [`crate::resolver::mb_resolve::mb_fetch`] instead of -/// [`crate::resolver::fast_preload::fast_preload`]. -/// -/// Intended for A/B benchmarking: install + lockfile-only callers can -/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). -/// All other behavior — workspace discovery, runtime injection, BFS, -/// graph→lock serialization, project cache export — is the same as -/// `build_deps`. The `EventReceiver` still receives BFS events; it -/// does NOT receive `PreloadFetching` / `PreloadProgress` events -/// because mb_fetch is silent (matches `manifest-bench`'s zero-event -/// loop). -/// -/// **Install-path note:** `pipeline_deps_options` callers that need -/// `PackageResolved` events to drive the download/clone pipeline -/// won't pipeline under this path — mb_fetch finishes all fetches -/// before BFS starts. Use only for `utoo deps`-style workloads, or -/// accept that install becomes phase-sequential. 
-pub async fn build_deps_mb(options: BuildDepsOptions) -> Result -where - G: Glob + Clone, - R: EventReceiver, -{ - let BuildDepsOptions { - cwd, - registry_url, - cache_dir, - manifest_store, - warm_project_cache, - concurrency, - peer_deps, - glob, - receiver, - supports_semver, - catalogs, - skip_preload: _, - } = options; - - // Steps 1-6: structurally identical to `build_deps` — read - // package.json, inject runtime deps, build initial graph, add - // root edges, discover and add workspaces. - let discovery = WorkspaceDiscovery::new(glob.clone()); - let root_path = discovery.find_root_path(&cwd).await?; - let pkg_path = root_path.join("package.json"); - let mut pkg: PackageJson = super::fs::read_json(&pkg_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; - - if let Some(engines) = &pkg.engines { - let runtime_deps = install_runtime_from_map(engines); - if !runtime_deps.is_empty() { - for (name, version) in runtime_deps { - pkg.optional_dependencies - .get_or_insert_with(HashMap::new) - .entry(name) - .or_insert(version); - } - } - } - - let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); - let root_index = graph.root_index; - let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); - add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); - - let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; - for workspace in workspaces { - let ws_pkg = workspace.package_json; - let workspace_node = - PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let workspace_index = graph.add_node(workspace_node); - let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let link_index = graph.add_node(link_node); - graph.add_physical_edge(root_index, workspace_index); - graph.add_physical_edge(root_index, link_index); - let dep_edge_id = graph.add_dependency_edge( - root_index, - 
workspace.name.clone(), - &ws_pkg.version, - EdgeType::Prod, - ); - graph.mark_dependency_resolved(dep_edge_id, workspace_index); - add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); - } - - // Step 7-8: cache + registry, same as `build_deps`. Warm project - // cache is honored. - let package_cache = Arc::new(PackageCache::default()); - let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); - - let mut builder = UnifiedRegistry::builder() - .registry(®istry_url) - .cache(package_cache) - .store(Arc::clone(&manifest_store)); - if let Some(semver) = supports_semver { - builder = builder.supports_semver(semver); - } - let registry = builder.build(); - - // Run mb_fetch instead of fast_preload — pre-warms cache by - // walking transitive deps via flat FuturesUnordered. Skipped if - // the warm project cache already covers the workload. - if cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); - let preload_config = PreloadConfig { - peer_deps, - concurrency, - }; - mb_fetch( - initial_deps, - registry.registry_url(), - registry.cache(), - &preload_config, - ) - .await; - } - - // BFS phase reads the now-warm cache. `skip_preload=true` skips - // the receiver-driven preload — mb_fetch already ran. 
- let mut config = BuildDepsConfig::default() - .with_peer_deps(peer_deps) - .with_concurrency(concurrency) - .with_skip_preload(true) - .with_catalogs(catalogs); - if let Some(dir) = cache_dir { - config = config.with_cache_dir(dir); - } - - build_deps_with_config(&mut graph, ®istry, config, &receiver) - .await - .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; - - let t_serialize_start = std::time::Instant::now(); - let (packages, _total) = graph.serialize_to_packages(&root_path); - let serialize_us = t_serialize_start.elapsed().as_micros() as u64; - - let t_cache_export_start = std::time::Instant::now(); - let mut project_cache = ProjectCacheData::default(); - for (key, manifest) in registry.cache().export_version_manifests() { - let (name, spec) = parse_package_spec(&key); - let version = manifest.version.clone(); - let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); - pkg_cache.specs.insert(spec.to_string(), version.clone()); - pkg_cache.manifests.insert(version, (*manifest).clone()); - } - let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; - - tracing::info!( - "p1-breakdown serialize_us={} cache_export_us={}", - serialize_us, - cache_export_us, - ); - - Ok(BuildDepsOutput { - lock: PackageLock::new(&pkg.name, &pkg.version, packages), - project_cache, - }) -} - /// Pre-populate `cache` from a warm project cache. 
Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 7a7cf8ca8..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 05486b5028961768791ffa3c97d518b33b738d50 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:59:05 +0800 Subject: [PATCH 24/24] experiment(pm): swap DiskManifestStore for NoopStore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Side experiment branched off perf/p1-resolve-concurrency at 01d15130. Question under test: how much of utoo's p1/p3 wall comes from the per-fetch disk-cache existence-check IO that service::registry::UnifiedRegistry issues alongside each manifest fetch (store.load_versions / store.load_version_manifest + fire-and-forget store.store_*)? Swaps `Context::manifest_store` from `DiskManifestStore` to `NoopStore`, which makes every store call a no-op without touching the filesystem. Affects ALL paths that go through `Context`: * `utoo deps` (lockfile-only): already bypasses UnifiedRegistry via mb_resolve, so no perf impact expected — confirms baseline. * `utoo install` (pipeline path): preload_manifests still goes through UnifiedRegistry, so this swap removes per-fetch disk IO from the install resolve phase. p3_cold_install delta is the meaningful number. * BFS edges that miss MemoryCache and fall into resolve_via_full_manifest: no disk fallback, so a cold cache miss falls straight to network instead of checking disk first. 
NOT for landing — measurement-only branch. Compare against 01d15130 to read the disk-cache IO cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +++++++++++++++++++++- crates/pm/src/util/manifest_store.rs | 6 ++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index c8b758a6f..e9226243b 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -9,6 +9,9 @@ use utoo_ruborist::service::{ use crate::service::pipeline::{PipelineChannels, PipelineReceiver}; use crate::util::cache::get_cache_dir; use crate::util::logger::ProgressReceiver; +// EXPERIMENT: DiskManifestStore swapped for NoopStore (see manifest_store +// fn below), so the disk-backed store is unused on this branch. +#[allow(unused_imports)] use crate::util::manifest_store::DiskManifestStore; use crate::util::project_cache; use crate::util::user_config::{ @@ -40,8 +43,25 @@ pub(crate) type Registry = UnifiedRegistry; pub(crate) struct Context; impl Context { + /// EXPERIMENT (experiment/no-disk-cache branch): swap + /// `DiskManifestStore` for `NoopStore` so every + /// `store.load_versions` / `store.load_version_manifest` call in + /// `service::registry::UnifiedRegistry` returns `None` without + /// touching the filesystem, and every `store.store_*` call is a + /// no-op. Used to A/B test how much of utoo's p1/p3 wall comes + /// from the per-fetch disk-cache existence-check IO that the + /// registry layer issues alongside each manifest fetch. + /// + /// Affects ALL paths that build `BuildDepsOptions` via this + /// helper (`deps_options` → `pipeline_deps_options`, + /// `build_deps`). 
The new `mb_resolve` lockfile-only path + /// already bypasses `UnifiedRegistry` entirely, so it sees no + /// effect from this swap; the install path (which still goes + /// through `UnifiedRegistry` for the pipeline preload) does see + /// the difference, and so does any BFS edge that misses + /// `MemoryCache` and falls into `resolve_via_full_manifest`. fn manifest_store() -> Arc { - Arc::new(DiskManifestStore::new(get_cache_dir())) + Arc::new(utoo_ruborist::service::NoopStore) } /// Create BuildDepsOptions with a custom event receiver. diff --git a/crates/pm/src/util/manifest_store.rs b/crates/pm/src/util/manifest_store.rs index 7f9c61bb1..b1fee9818 100644 --- a/crates/pm/src/util/manifest_store.rs +++ b/crates/pm/src/util/manifest_store.rs @@ -19,10 +19,15 @@ use utoo_ruborist::service::{ManifestStore, VersionsInfo}; use crate::util::json::read_json_file; +// EXPERIMENT: ruborist_context swaps DiskManifestStore for NoopStore on +// this branch — type stays defined to keep the import path valid, but +// fields go unread. +#[allow(dead_code)] pub struct DiskManifestStore { cache_dir: PathBuf, } +#[allow(dead_code)] impl DiskManifestStore { pub fn new(cache_dir: PathBuf) -> Self { Self { cache_dir } @@ -75,6 +80,7 @@ impl ManifestStore for DiskManifestStore { /// Serialize `value` and write to `path`. On `NotFound`, create the parent /// directory and retry once — saves the mkdir syscall on every warm-cache /// rewrite. Errors are logged at debug; disk cache is opportunistic. +#[allow(dead_code)] async fn write_json(path: &Path, value: &T) { let bytes = match serde_json::to_vec(value) { Ok(b) => b,