From 2bf71158cde397de4b423ce100385a0e5561e900 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 21:56:24 +0800 Subject: [PATCH 01/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=20256=20+=20add=20fetch=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit p1_resolve has been ~0.9s behind bun on phases bench for the past several PRs. Pcap on prior runs measured bun opening ~260 parallel TCP streams against registry.npmjs.org for resolve, while utoo opened ~70 (the 64 manifests-concurrency-limit cap was at saturation). Adding fetch-breakdown timing in ruborist showed where p1's 22s (local Mac) actually goes: fetch-timings: n=2730 sum_request = 1089s (88% — TCP+TLS+HTTP RTT to first byte) sum_body = 138s (11% — body download) sum_parse = 2s (0.16% — simd_json on rayon) The dominant cost is per-request RTT, not parsing or body transfer. The lever is the cap on concurrent in-flight requests. This commit: 1. Adds `crates/ruborist/src/util/timing.rs` — process-wide atomic accumulator that records per-fetch (request_us, body_us, parse_us, bytes) inside both `fetch_full_manifest` and `fetch_version_manifest`. Reset before each preload phase, dumped at INFO level after preload + bfs. 2. Bumps `manifests-concurrency-limit` default 64 → 256 to match bun's observed working point against npmjs.org. CI bench will validate. Expected: p1 utoo wall drops toward bun's range (~2.3s on GHA). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 10 +- crates/ruborist/src/resolver/builder.rs | 17 ++- crates/ruborist/src/service/manifest.rs | 24 ++++- crates/ruborist/src/util/mod.rs | 2 + crates/ruborist/src/util/timing.rs | 134 ++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 6 deletions(-) create mode 100644 crates/ruborist/src/util/timing.rs diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 34ee45a34..bc281fb40 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,9 +132,15 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration +// Manifest fetch concurrency configuration. +// +// 256 to match bun's observed ~260 parallel TCP streams against +// registry.npmjs.org. Local fetch-breakdown instrumentation showed +// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), +// only 11% body, 0.16% parse — so the dominant lever for p1 wall is +// the cap on concurrent in-flight manifest requests. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index b0bf2794c..166372c91 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -756,6 +756,7 @@ async fn run_preload_phase( return; } + crate::util::FETCH_TIMINGS.reset(); let start = tokio::time::Instant::now(); let initial_deps = gather_preload_deps(graph, config.peer_deps); @@ -794,7 +795,13 @@ async fn run_preload_phase( failed: stats.failed_count, }); - tracing::debug!("Preload phase: {:?}", start.elapsed()); + let preload_elapsed = start.elapsed(); + tracing::debug!("Preload phase: {:?}", preload_elapsed); + tracing::info!( + "p1-breakdown preload_wall={}ms | {}", + preload_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); } /// Run the BFS traversal phase to build the dependency tree. 
@@ -896,7 +903,13 @@ async fn run_bfs_phase( current_level = next_level; } - tracing::debug!("Build phase: {:?}", start.elapsed()); + let bfs_elapsed = start.elapsed(); + tracing::debug!("Build phase: {:?}", bfs_elapsed); + tracing::info!( + "p1-breakdown bfs_wall={}ms | {}", + bfs_elapsed.as_millis(), + crate::util::FETCH_TIMINGS.snapshot().summary_line(), + ); Ok(()) } diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 74baf3b9c..36bc6a85a 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -12,6 +12,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on rayon's CPU thread pool (native) or inline /// (wasm32). Keeps the tokio runtime free of `simd_json` work so other @@ -91,7 +92,9 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result) -> Result(bytes) + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = bytes.len() as u64; + let t_parse_start = std::time::Instant::now(); + let parsed = parse_json_off_runtime::(bytes) .await - .map_err(FetchError::Permanent) + .map_err(FetchError::Permanent); + let parse_us = t_parse_start.elapsed().as_micros() as u64; + if parsed.is_ok() { + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + } + parsed } else { Err(classify_status(response.status(), &url)) } diff --git a/crates/ruborist/src/util/mod.rs b/crates/ruborist/src/util/mod.rs index 649e47c95..a7f0b7b7d 100644 --- a/crates/ruborist/src/util/mod.rs +++ b/crates/ruborist/src/util/mod.rs @@ -1,6 +1,8 @@ //! Shared utility primitives for ruborist and downstream consumers. 
pub mod oncemap; +pub mod timing; pub use crate::model::util::{PackageNameStr, parse_package_spec, read_package_json}; pub use oncemap::OnceMap; +pub use timing::{FETCH_TIMINGS, FetchTimings, FetchTimingsSnapshot}; diff --git a/crates/ruborist/src/util/timing.rs b/crates/ruborist/src/util/timing.rs new file mode 100644 index 000000000..f50e921b9 --- /dev/null +++ b/crates/ruborist/src/util/timing.rs @@ -0,0 +1,134 @@ +//! Per-phase manifest fetch timing accumulator for p1 perf investigation. +//! +//! Splits each `fetch_*_manifest` call into three observable pieces: +//! - `request_us`: from `request.send().await` to response headers +//! received. Captures TCP connect (when not pooled), TLS handshake, +//! HTTP request roundtrip, and server-side processing. +//! - `body_us`: from response headers to the entire JSON body buffered. +//! Network-bandwidth bound for large packuments. +//! - `parse_us`: from full body buffered to a typed manifest. CPU bound +//! (simd_json on a spawn_blocking thread). +//! +//! `parse_us` is wall-clock for the await on `parse_json_off_runtime` — +//! since JSON parse runs on `spawn_blocking`, this includes scheduling +//! latency rather than pure CPU time. Together with the per-fetch total +//! already tracked in `preload_manifests`, this lets us answer "where +//! did p1's wall time go?" without external profiling. +//! +//! All counters are `AtomicU64` so the recording path is lock-free. +//! Numbers are reset between resolves via [`reset()`] so successive +//! `utoo deps` invocations report independently. + +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Per-process accumulator for manifest fetch timings. +#[derive(Default, Debug)] +pub struct FetchTimings { + /// Number of fetches recorded (full + version manifest). + pub count: AtomicU64, + /// Sum of microseconds spent in `request.send().await`. + pub request_us: AtomicU64, + /// Sum of microseconds spent in `response.bytes().await`. 
+ pub body_us: AtomicU64, + /// Sum of microseconds spent awaiting `parse_json_off_runtime`. + pub parse_us: AtomicU64, + /// Sum of body bytes received across all fetches. + pub bytes: AtomicU64, +} + +impl FetchTimings { + /// Record one fetch's split timings. Call once per successful fetch. + pub fn record(&self, request_us: u64, body_us: u64, parse_us: u64, bytes: u64) { + self.count.fetch_add(1, Ordering::Relaxed); + self.request_us.fetch_add(request_us, Ordering::Relaxed); + self.body_us.fetch_add(body_us, Ordering::Relaxed); + self.parse_us.fetch_add(parse_us, Ordering::Relaxed); + self.bytes.fetch_add(bytes, Ordering::Relaxed); + } + + /// Reset all counters to zero. + pub fn reset(&self) { + self.count.store(0, Ordering::Relaxed); + self.request_us.store(0, Ordering::Relaxed); + self.body_us.store(0, Ordering::Relaxed); + self.parse_us.store(0, Ordering::Relaxed); + self.bytes.store(0, Ordering::Relaxed); + } + + /// Snapshot of the current accumulator state. + pub fn snapshot(&self) -> FetchTimingsSnapshot { + FetchTimingsSnapshot { + count: self.count.load(Ordering::Relaxed), + request_us: self.request_us.load(Ordering::Relaxed), + body_us: self.body_us.load(Ordering::Relaxed), + parse_us: self.parse_us.load(Ordering::Relaxed), + bytes: self.bytes.load(Ordering::Relaxed), + } + } +} + +/// Immutable snapshot suitable for printing. +#[derive(Debug, Clone, Copy)] +pub struct FetchTimingsSnapshot { + pub count: u64, + pub request_us: u64, + pub body_us: u64, + pub parse_us: u64, + pub bytes: u64, +} + +impl FetchTimingsSnapshot { + /// One-line summary for tracing logs. 
+ pub fn summary_line(&self) -> String { + if self.count == 0 { + return "fetch-timings: no requests recorded".to_string(); + } + let count = self.count; + let avg_req = self.request_us / count; + let avg_body = self.body_us / count; + let avg_parse = self.parse_us / count; + let avg_bytes = self.bytes / count; + format!( + "fetch-timings: n={} sum_request={}ms sum_body={}ms sum_parse={}ms total_bytes={}MB | avg_request={}us avg_body={}us avg_parse={}us avg_bytes={}KB", + count, + self.request_us / 1_000, + self.body_us / 1_000, + self.parse_us / 1_000, + self.bytes / 1_000_000, + avg_req, + avg_body, + avg_parse, + avg_bytes / 1_024, + ) + } +} + +/// Process-wide manifest fetch timing accumulator. +pub static FETCH_TIMINGS: FetchTimings = FetchTimings { + count: AtomicU64::new(0), + request_us: AtomicU64::new(0), + body_us: AtomicU64::new(0), + parse_us: AtomicU64::new(0), + bytes: AtomicU64::new(0), +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn record_and_snapshot() { + FETCH_TIMINGS.reset(); + FETCH_TIMINGS.record(100, 200, 300, 1024); + FETCH_TIMINGS.record(150, 250, 350, 2048); + let snap = FETCH_TIMINGS.snapshot(); + assert_eq!(snap.count, 2); + assert_eq!(snap.request_us, 250); + assert_eq!(snap.body_us, 450); + assert_eq!(snap.parse_us, 650); + assert_eq!(snap.bytes, 3072); + FETCH_TIMINGS.reset(); + let snap2 = FETCH_TIMINGS.snapshot(); + assert_eq!(snap2.count, 0); + } +} From 8ac97ae036ab97cb986ce19109af18e130dbc1cd Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:25:36 +0800 Subject: [PATCH 02/32] =?UTF-8?q?chore(p1):=20revert=20concurrency=20256?= =?UTF-8?q?=20=E2=86=92=2064=20+=20restore=20manifest-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes after the GHA bench on the previous commit (PR #2916, run 25559625024) showed the concurrency=256 hypothesis was wrong on GHA's environment. 
Revert concurrency 256 → 64 --------------------------- The new fetch-timing instrumentation shipped in the previous commit caught the surprise: GHA's pcap-vs-local profile is the *opposite* of what local Mac measurements suggested. metric local Mac GHA Linux avg_request 399ms 70ms ← network MUCH faster on GHA avg_body 50ms 20ms avg_parse 730µs 266ms ← parse 365× SLOWER on GHA Mechanism: `parse_json_off_runtime` dispatches to `rayon::spawn`, and rayon's pool size is `num_cpus` (= 2 on GHA ubuntu-latest). Bumping concurrency 64 → 256 queued 256 manifest parses behind 2 rayon workers — head-of-line blocking. avg_parse jumped from ~10ms to 266ms wall, dragging p1 utoo wall from 3.10s up to 3.33s. Restore manifest-bench ---------------------- Brought back `crates/manifest-bench` (originally landed in the post-#2818 driver hunt, dropped in af714eb3 once #2818 graduated). It's a single-binary HTTP-only fetch tool that strips out the ruborist pipeline (no BFS, no dedup, no parse, no project cache, no lockfile write) — fires `GET /` in parallel and reports the same diag shape as the new `p1-breakdown` lines. Goal: separate the network ceiling from the resolver pipeline so the next round of p1 experiments (parse offload, partial parse, dedicated parse pool, etc.) can be evaluated against a stable "pure network" baseline. Knobs (unchanged from the original drop): --concurrency N sweep without rebuilding utoo --reps N run same workload back-to-back --single-version use //latest (smaller bodies) --user-agent X UA-fingerprint experiments --http1-only H2 vs H1 toggle --accept X override Accept header Same TLS stack as ruborist (rustls + aws-lc-rs, native roots). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 1 + crates/manifest-bench/Cargo.toml | 37 +++ crates/manifest-bench/src/main.rs | 371 ++++++++++++++++++++++++++++++ crates/pm/src/util/user_config.rs | 19 +- 4 files changed, 421 insertions(+), 7 deletions(-) create mode 100644 crates/manifest-bench/Cargo.toml create mode 100644 crates/manifest-bench/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index ef4a4f926..0574a185a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] resolver = "2" members = [ + "crates/manifest-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/crates/manifest-bench/Cargo.toml b/crates/manifest-bench/Cargo.toml new file mode 100644 index 000000000..5b01e57c0 --- /dev/null +++ b/crates/manifest-bench/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "manifest-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Standalone HTTP-only manifest fetch benchmark, isolating network behaviour from ruborist's resolver pipeline." + +[[bin]] +name = "manifest-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Identical TLS / DNS choices to ruborist so we measure the *protocol* +# characteristics of the same stack, not a different implementation. 
+reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/manifest-bench/src/main.rs b/crates/manifest-bench/src/main.rs new file mode 100644 index 000000000..fa70f3fe4 --- /dev/null +++ b/crates/manifest-bench/src/main.rs @@ -0,0 +1,371 @@ +//! Standalone HTTP-only manifest fetch benchmark. +//! +//! Isolates the network behaviour of `reqwest + rustls + tokio` from +//! ruborist's resolver pipeline (BFS, dedup, parse, lockfile, project +//! cache). Reads a list of package names, builds manifest URLs, fires +//! parallel `GET` requests, records `(start, end)` per request, and +//! reports the same diag shape as ruborist's `Preload HTTP diag` line. +//! +//! Two input modes: +//! - `--names-file ` — newline-separated package names +//! - `--lockfile ` — a npm-style package-lock.json; we extract +//! the `packages.*` (v3) or `dependencies.*` (v2) keys +//! +//! Two registry modes: +//! - `/` — full manifest endpoint (default, npmjs) +//! - `//latest` — single-version endpoint +//! (gated behind `--single-version`) +//! +//! Each request reads the body to completion (we only measure I/O, no +//! parse). Output: same fields as preload's HTTP diag for direct +//! comparison. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; + +#[derive(Parser, Debug)] +#[command( + name = "manifest-bench", + about = "HTTP-only manifest fetch bench (no parse, no resolver)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// File of newline-separated package names. Mutually exclusive with `--lockfile`. 
+ #[arg(long, conflicts_with = "lockfile")] + names_file: Option, + + /// `package-lock.json` file. Reads top-level `packages.*.name` keys. + #[arg(long)] + lockfile: Option, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 128)] + concurrency: usize, + + /// Number of times to repeat the whole sweep (each iteration is a + /// fresh `reqwest::Client`, so connection pool / TLS handshake + /// costs are paid each time, matching `hyperfine` cold-start). + #[arg(long, default_value_t = 1)] + reps: usize, + + /// Use the single-version endpoint `//latest` instead of the + /// full-manifest endpoint `/`. Smaller bodies, more requests + /// served per byte. + #[arg(long)] + single_version: bool, + + /// Override `Accept` header. Default mimics ruborist's preload + /// (`application/vnd.npm.install-v1+json` — abbreviated metadata). + #[arg(long)] + accept: Option, + + /// Override `User-Agent`. Default uses reqwest's default. Try + /// `Bun/1.x.x` to test whether Cloudflare differentiates by UA. + #[arg(long)] + user_agent: Option, + + /// Force HTTP/1.1 (no H2 negotiation). Default lets ALPN decide. 
+ #[arg(long)] + http1_only: bool, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let names = load_names(&args)?; + if names.is_empty() { + return Err(anyhow!("no package names found in input")); + } + + println!( + "manifest-bench: registry={} concurrency={} reps={} names={} h1_only={} single_version={} accept={} ua={}", + args.registry, + args.concurrency, + args.reps, + names.len(), + args.http1_only, + args.single_version, + args.accept.as_deref().unwrap_or(""), + args.user_agent.as_deref().unwrap_or(""), + ); + + for rep in 1..=args.reps { + run_once(&args, &names, rep).await?; + } + + Ok(()) +} + +fn load_names(args: &Args) -> Result> { + if let Some(path) = &args.names_file { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return Ok(raw + .lines() + .map(str::trim) + .filter(|s| !s.is_empty() && !s.starts_with('#')) + .map(str::to_string) + .collect()); + } + + if let Some(path) = &args.lockfile { + let raw = std::fs::read_to_string(path).with_context(|| format!("read {path:?}"))?; + return extract_lockfile_names(&raw); + } + + Err(anyhow!("provide --names-file or --lockfile")) +} + +/// Pull unique package names from an npm v3 lockfile (`packages.*`) +/// or an older v2 lockfile (`dependencies.*`). +fn extract_lockfile_names(raw: &str) -> Result> { + use std::collections::BTreeSet; + + let v: serde_json::Value = serde_json::from_str(raw).context("parse lockfile JSON")?; + let mut names: BTreeSet = BTreeSet::new(); + + if let Some(packages) = v.get("packages").and_then(|p| p.as_object()) { + for key in packages.keys() { + if key.is_empty() { + continue; + } + // npm v3 packages key like "node_modules/foo" or + // "node_modules/@scope/bar/node_modules/baz" — take the + // last path segment (or @scope/name pair). 
+ let last = last_module_name(key); + if !last.is_empty() { + names.insert(last); + } + } + } else if let Some(deps) = v.get("dependencies").and_then(|d| d.as_object()) { + for key in deps.keys() { + names.insert(key.clone()); + } + } + + Ok(names.into_iter().collect()) +} + +fn last_module_name(key: &str) -> String { + let parts: Vec<&str> = key.split("node_modules/").collect(); + let tail = parts.last().copied().unwrap_or(""); + tail.to_string() +} + +#[derive(Debug)] +struct ReqResult { + start: Instant, + end: Instant, + bytes: usize, + status: u16, +} + +async fn run_once(args: &Args, names: &[String], rep: usize) -> Result<()> { + // Build a fresh client per rep — matches hyperfine's cold-start + // assumption that each iteration pays the TLS handshake cost. + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let accept = Arc::new( + args.accept + .clone() + .unwrap_or_else(|| "application/vnd.npm.install-v1+json".to_string()), + ); + + let single_version = args.single_version; + let concurrency = args.concurrency; + + let phase_start = Instant::now(); + let mut futs = FuturesUnordered::new(); + let mut idx = 0usize; + let mut results: Vec = Vec::with_capacity(names.len()); + + while idx < names.len() && futs.len() < concurrency { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + + while let Some(res) = futs.next().await { + results.push(res); + if idx < names.len() { + spawn_one( + &client, + ®istry, + &names[idx], + &accept, + single_version, + &mut futs, + ); + idx += 1; + } + } + let phase_wall_ms = phase_start.elapsed().as_millis(); + + report(rep, &results, phase_wall_ms); + Ok(()) +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_one( + client: &reqwest::Client, + registry: &Arc, + name: &str, + accept: &Arc, + single_version: bool, + futs: &mut FuturesUnordered, +) { + let url = if single_version { + 
format!("{registry}/{name}/latest") + } else { + format!("{registry}/{name}") + }; + let client = client.clone(); + let accept = Arc::clone(accept); + futs.push(Box::pin(async move { + let start = Instant::now(); + let req = client.get(&url).header("accept", accept.as_str()).send(); + let (bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.len()).unwrap_or(0); + (body, status) + } + Err(_) => (0, 0), + }; + let end = Instant::now(); + ReqResult { + start, + end, + bytes, + status, + } + })); +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Matches ruborist's `service::http` setup. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? 
+ .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} + +fn report(rep: usize, results: &[ReqResult], wall_ms: u128) { + if results.is_empty() { + eprintln!("[rep {rep}] no results"); + return; + } + + let mut spans: Vec<(Instant, Instant)> = results.iter().map(|r| (r.start, r.end)).collect(); + spans.sort_by_key(|(s, _)| *s); + + let first_start = spans.first().unwrap().0; + let last_end = spans.iter().map(|(_, e)| *e).max().unwrap(); + let win_wall = last_end.duration_since(first_start).as_millis(); + + let mut per_us: Vec = spans + .iter() + .map(|(s, e)| e.duration_since(*s).as_micros()) + .collect(); + per_us.sort_unstable(); + let n = per_us.len(); + let pct = |p: usize| per_us[(n * p).div_ceil(100).saturating_sub(1)]; + let sum: u128 = per_us.iter().sum(); + let p50 = per_us[n / 2]; + + let mut busy_us: u128 = 0; + let (mut cur_s, mut cur_e) = spans[0]; + for &(s, e) in &spans[1..] 
{ + if s <= cur_e { + if e > cur_e { + cur_e = e; + } + } else { + busy_us += cur_e.duration_since(cur_s).as_micros(); + cur_s = s; + cur_e = e; + } + } + busy_us += cur_e.duration_since(cur_s).as_micros(); + + let bytes_total: usize = results.iter().map(|r| r.bytes).sum(); + let ok = results.iter().filter(|r| r.status == 200).count(); + let err = results.iter().filter(|r| r.status == 0).count(); + let four_xx = results + .iter() + .filter(|r| (400..500).contains(&r.status)) + .count(); + let five_xx = results + .iter() + .filter(|r| (500..600).contains(&r.status)) + .count(); + + let avg_conc = if busy_us > 0 { + sum as f64 / busy_us as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] n={} phase_wall={}ms win_wall={}ms busy={}ms ({:.0}%) sum={}ms avg_conc={:.1} p50={}ms p95={}ms p99={}ms max={}ms bytes={} 200={} 4xx={} 5xx={} err={}", + n, + wall_ms, + win_wall, + busy_us / 1000, + if win_wall > 0 { + 100.0 * (busy_us as f64 / 1000.0) / win_wall as f64 + } else { + 0.0 + }, + sum / 1000, + avg_conc, + p50 / 1000, + pct(95) / 1000, + pct(99) / 1000, + per_us.last().unwrap() / 1000, + bytes_total, + ok, + four_xx, + five_xx, + err, + ); +} diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index bc281fb40..a0235830a 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -132,15 +132,20 @@ pub fn get_install_scope() -> InstallScope { INSTALL_SCOPE.get().copied().unwrap_or_default() } -// Manifest fetch concurrency configuration. +// Manifest fetch concurrency configuration. Default kept at 64. // -// 256 to match bun's observed ~260 parallel TCP streams against -// registry.npmjs.org. Local fetch-breakdown instrumentation showed -// 88% of preload-phase work is in per-request RTT (TCP+TLS+server), -// only 11% body, 0.16% parse — so the dominant lever for p1 wall is -// the cap on concurrent in-flight manifest requests. 
+// We tried 256 to match bun's observed parallel streams; on GHA the +// fetch-breakdown instrumentation showed sum_parse exploded from +// ~10ms (local Mac, network-bound) to 728s on first cold run with +// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to +// rayon, which has only num_cpus (=2 on GHA) workers. Bumping +// concurrency to 256 queued 256 parses behind 2 workers → wall +// per-parse jumped from 730µs to 266ms. Net p1 wall *increased* +// 3.10s → 3.33s on phases bench. Keep 64 until we address the +// parse-side queueing (e.g. inline parse on tokio, or a wider +// dedicated parse pool). static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 256)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 5690a9b6b416fb7040a52a3ce24a303177d8bc76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 22:56:20 +0800 Subject: [PATCH 03/32] ci(p1): wire manifest-bench standalone HTTP sweep into bench-phases-linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build-linux now also builds + uploads `manifest-bench` when a phases bench is going to run (label or dispatch). bench-phases-linux downloads the binary and runs it after the regular phase-isolated benchmark. Sweep mirrors the original (#2818-era) wire-in: concurrency: 32 / 64 / 96 / 128 / 192 / 256 (HTTP/1.1, full manifest) protocol: H1 vs H2-negotiate (cap=128) endpoint: full vs `//latest` (cap=128, smaller bodies) UA: default vs `Bun/1.2.21` (cap=128) Output goes to /tmp/pm-bench-output/manifest-bench-npmjs.log and ships in the existing pm-bench-logs-linux artifact — no PR comment surface (the headline phases bench comment stays the same). 
Why now: the new ruborist `p1-breakdown` instrumentation showed sum_parse on GHA can dominate when concurrency is bumped (256: sum_parse 728s vs sum_request 193s). To attribute the bun-vs-utoo gap on p1_resolve we need a "pure HTTP" baseline that strips out ruborist's parse / BFS / dedup / lockfile path. manifest-bench is that baseline: same TLS stack as ruborist (rustls + aws-lc-rs, native roots), no resolver pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 74c90ece5..b25f5c380 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -143,6 +143,24 @@ jobs: name: utoo-linux-x64 path: target/x86_64-unknown-linux-gnu/release/utoo retention-days: 1 + # manifest-bench is a standalone HTTP-only fetch sweeper used as + # the network-only baseline for p1_resolve perf work. Built only + # when phases bench is going to run (label or dispatch), so plain + # PR builds aren't slowed by the extra crate. 
+ - name: Build manifest-bench (p1 baseline) + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p manifest-bench + - name: Upload manifest-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/manifest-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -516,6 +534,19 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # Download the manifest-bench binary built by build-linux. Used as + # the network-only baseline for p1_resolve work — strips out parse, + # BFS, dedup, lockfile write so the wall is pure HTTP fetch. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -565,6 +596,55 @@ jobs: run: | mkdir -p /tmp/pm-bench-output bash bench/pm-bench-phases.sh 2>&1 | tee /tmp/pm-bench-output/bench-phases-npmmirror.log + # Standalone HTTP-only sweep — sweeps the network-only ceiling + # against the same lockfile-derived workload phase-bench just used. + # Output goes into the bench logs artifact; no PR comment surface. + - name: Standalone manifest-bench (HTTP-only sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + mkdir -p /tmp/pm-bench + git clone --depth 1 "https://github.com/ant-design/$PROJECT" "$PROJECT_DIR" + fi + cd "$PROJECT_DIR" + if [ ! -f package-lock.json ]; then + echo "==> generating lockfile via utoo (one-shot, untimed)" + utoo deps --registry "$REGISTRY" || true + fi + ls -la package-lock.json || { echo "no lockfile; skipping manifest-bench"; exit 0; } + + MB_LOG=/tmp/pm-bench-output/manifest-bench-npmjs.log + { + echo "============================================================" + echo "manifest-bench: HTTP-only fetch (no parse, no resolver)" + echo " Goal: isolate reqwest/rustls/tokio behaviour from" + echo " ruborist's resolver pipeline. Same metric shape as" + echo " ruborist's p1-breakdown line." 
+ echo "============================================================" + for CAP in 32 64 96 128 192 256; do + echo + echo "--- concurrency=$CAP, h1, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 2 --http1-only || true + done + echo + echo "--- concurrency=128, h2 negotiate, full manifest, default UA ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 || true + echo + echo "--- concurrency=128, h1, single-version endpoint ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --single-version || true + echo + echo "--- concurrency=128, h1, UA=Bun/1.2.21 ---" + "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ + --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true + } 2>&1 | tee "$MB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 From 94af458887de3add09f2e973dbbad6f2524f1a5f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:24:56 +0800 Subject: [PATCH 04/32] perf(ruborist): inline JSON parse, drop rayon::spawn dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI fetch-breakdown on GHA (run 25562552058, conc=64) showed parse queueing on rayon dominates the gap to manifest-bench's pure-HTTP baseline: manifest-bench (pure HTTP, conc=64): 2.12s wall utoo p1 (full ruborist): 3.10s wall ← +1.0s overhead ↑ sum_parse 95s vs sum_request 95s, parse 50% of work-time ↑ avg_parse 30ms wall vs ~5ms actual CPU — the 25ms extra is rayon queue wait Mechanism: 64 concurrent tasks all dispatching parse to rayon's pool (size = num_cpus = 2 on GHA). Queue depth grows to ~32 per worker. Each parse waits 25ms+ in queue before running its 5ms of CPU work. Round 1 fix: inline parse, drop the rayon hop. 
simd_json on a tokio worker thread is fast (~5ms for 115KB JSON), and the tokio runtime's cooperative budget naturally rebalances CPU across the 64 tasks. Expected on next CI: - avg_parse drops from 30ms wall → ~5-10ms wall (close to CPU-only) - preload_wall drops from 5.4s → ~3.5-4s for cold runs - p1 hyperfine wall drops from 3.10s → 2.3-2.5s, narrowing the gap to manifest-bench's 2.12s ceiling If parse becomes the new bottleneck (CPU-bound), next round could look at partial parse / lazy field access. If wall doesn't drop, hypothesis is wrong and we look elsewhere (BFS, dedup, lockfile). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 29 +++++++++---------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 36bc6a85a..3502f6ec2 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,29 +14,20 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes on rayon's CPU thread pool (native) or inline -/// (wasm32). Keeps the tokio runtime free of `simd_json` work so other -/// in-flight manifest fetches keep driving network IO while this one -/// parses. +/// Parse JSON bytes inline on the calling tokio task. Previously this +/// dispatched to `rayon::spawn` to "free the runtime", but +/// fetch-breakdown instrumentation on GHA showed the rayon hop made it +/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), +/// 64 concurrent fetches all dispatching parse queued behind 2 workers +/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + +/// CPU). Inlining puts parse on the tokio worker that already owns +/// the buffer; the cooperative-scheduling budget naturally rebalances +/// CPU between fetches. 
async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - #[cfg(not(target_arch = "wasm32"))] - { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { - let result = simd_json::serde::from_slice::(&mut bytes) - .map_err(|e| anyhow!("JSON parse error: {e}")); - let _ = tx.send(result); - }); - rx.await - .map_err(|e| anyhow!("rayon parse channel closed: {e}"))? - } - #[cfg(target_arch = "wasm32")] - { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) - } + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) } /// Result of a full manifest fetch with ETag support. From ee5f5f4d23c8c9668c90c7d6b3b12eb49dab3afe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Fri, 8 May 2026 23:47:49 +0800 Subject: [PATCH 05/32] perf(ruborist): switch JSON parse to tokio spawn_blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 1 (inline parse) reverted on data: GHA showed +0.37s p1 regression because parse blocked tokio runtime workers, dropping eff_parallel 42 → 35 even though per-fetch work-time fell. avg_request went up from 35ms → 52ms — symptomatic of socket reads being delayed by the parsing task on the same worker. metric round 0 (rayon) round 1 (inline) p1 wall 3.27s 3.64s ⚠️ +0.37s avg_parse 30ms (queued) 300µs ✓ avg_request 35ms 52ms ⚠️ +17ms (worker contention) eff_parallel 42 35 ⚠️ Round 2 attempts the third option: `tokio::task::spawn_blocking`. - rayon's pool was too small (num_cpus = 2 on GHA) — 64 concurrent parses queued behind 2 workers, parse wall 30ms. - inline parse held tokio worker hostage during simd_json call, starving in-flight socket reads. - tokio's blocking pool has a much larger default cap (512), so 64 concurrent parses never queue. Unlike rayon there's no contention with the install path's parallel-write rayon usage. 
Unlike inline the tokio runtime workers stay free to drive network I/O. Expected on next CI: - avg_parse drops to ~5-10ms wall (close to CPU floor, no queue) - avg_request stays ~35ms (workers free for I/O) - eff_parallel returns to ~50, possibly higher - p1 wall drops toward manifest-bench's 2.10s ceiling Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/service/manifest.rs | 39 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 3502f6ec2..90f1db71b 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -14,20 +14,39 @@ use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::util::FETCH_TIMINGS; -/// Parse JSON bytes inline on the calling tokio task. Previously this -/// dispatched to `rayon::spawn` to "free the runtime", but -/// fetch-breakdown instrumentation on GHA showed the rayon hop made it -/// strictly worse: rayon's pool is `num_cpus` (= 2 on ubuntu-latest), -/// 64 concurrent fetches all dispatching parse queued behind 2 workers -/// — avg_parse ballooned from ~5ms (CPU only) to 30ms wall (queue + -/// CPU). Inlining puts parse on the tokio worker that already owns -/// the buffer; the cooperative-scheduling budget naturally rebalances -/// CPU between fetches. +/// Parse JSON bytes on tokio's blocking thread pool. +/// +/// The history of this function captures three different attempts: +/// - rayon::spawn (original): rayon's pool is `num_cpus` (= 2 on +/// GHA), 64 concurrent parses queued behind 2 workers → avg_parse +/// 30ms wall vs ~5ms CPU. round-0 baseline. 
+/// - inline (round 1, reverted): no rayon hop, but the simd_json +/// call blocks the tokio runtime worker, so other in-flight +/// fetches couldn't drive their socket I/O — avg_request grew +/// 35ms → 52ms (+17ms), eff_parallel 42 → 35, net p1 wall +0.37s. +/// - spawn_blocking (current): tokio's dedicated blocking pool has +/// a much higher default cap (512), so 64 concurrent parses are +/// never queued. Unlike rayon there's no contention with the +/// install path's parallel-write rayon usage, and unlike inline +/// the tokio runtime workers stay free to drive network I/O on +/// all in-flight fetches. async fn parse_json_off_runtime(mut bytes: Vec) -> Result where T: serde::de::DeserializeOwned + Send + 'static, { - simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + simd_json::serde::from_slice::(&mut bytes) + .map_err(|e| anyhow!("JSON parse error: {e}")) + }) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? + } + #[cfg(target_arch = "wasm32")] + { + simd_json::serde::from_slice::(&mut bytes).map_err(|e| anyhow!("JSON parse error: {e}")) + } } /// Result of a full manifest fetch with ETag support. From 16404fc481577a03b00ba2f46aa1f3711ec5351f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:14:46 +0800 Subject: [PATCH 06/32] perf(ruborist): switch extract_core_version to spawn_blocking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 moved parse_json_off_runtime off rayon (-0.11s p1). But fetch-breakdown still showed avg_request 41ms vs round 0's 35ms, hinting at a second source of rayon contention. Found it: `extract_core_version_off_runtime` is also on `rayon::spawn`. 
On npmjs.org's `!supports_semver` path EVERY fetch resolves through `resolve_via_full_manifest`, which fetches the full packument once per package name (deduped via inflight_full) and then calls `extract_core_version_off_runtime` per (name, spec) to materialize the chosen version into a `CoreVersionManifest`. So per fetch we hit rayon TWICE — once for the JSON parse (round 2 moved to spawn_blocking), and once for `get_core_version` (still on rayon). The second hop has the same head-of-line blocking signature as the first: 64 concurrent resolves dispatching to a 2-thread rayon pool. Round 3: move extract_core_version_off_runtime to spawn_blocking for the same reasons. The work is JSON lazy-reparse (`raw_json` sub-tree decoding) — genuinely blocking, well-suited for tokio's blocking pool. Expected: utoo p1 wall drops further toward manifest-bench's 2.10s ceiling. avg_request should fall back from 41ms → ~35ms (rayon contention removed from the fetch task's await chain). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 37e95deb9..15c762eb5 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,14 +163,20 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { + // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking + // history: rayon's `num_cpus` pool oversubscribes when many concurrent + // resolves all extract from full manifests at once. spawn_blocking's + // larger pool avoids the queue, and the work is genuinely blocking + // (lazy JSON re-parse via `get_core_version`) so the blocking pool + // is the right home. 
#[cfg(not(target_arch = "wasm32"))] { - let (tx, rx) = tokio::sync::oneshot::channel(); - rayon::spawn(move || { + tokio::task::spawn_blocking(move || { let core = full.get_core_version(&version).map(Arc::new); - let _ = tx.send((version, core)); - }); - rx.await.expect("rayon parse worker dropped before sending") + (version, core) + }) + .await + .expect("spawn_blocking parse worker panicked") } #[cfg(target_arch = "wasm32")] { From 460a53885b30982bd19c68ca1a866fa540c66a76 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 00:37:37 +0800 Subject: [PATCH 07/32] revert + instrument(ruborist): post-build phase timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes for round 4 of p1 optimization: 1. Revert `extract_core_version_off_runtime` from spawn_blocking back to rayon::spawn (round 3). Within-run measurement showed +0.42s regression vs utoo-next (round 2 was +0.11s). Likely cause: this function is called per (name, spec), so multi-spec packages call it 2-5x per fetch. spawn_blocking's per-dispatch overhead exceeds rayon queue savings at this multiplier. 2. Add `serialize_us` and `cache_export_us` to the p1-breakdown line so we can attribute the remaining gap. Currently: manifest-bench wall: 2.10s (pure HTTP ceiling) utoo p1 wall (round 2): 3.16s gap: 1.06s We have: preload_wall ≈ 2.7s (logged) bfs_wall ≈ 0.3s (logged) serialize_us ? cache_export_us ? ← suspected: full manifest deep-clone into ProjectCacheData for ~2730 entries Next round will have data to choose between attacking serialize, cache export, or the BFS loop body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/model/manifest.rs | 24 +++++++++++++----------- crates/ruborist/src/service/api.rs | 10 ++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/model/manifest.rs b/crates/ruborist/src/model/manifest.rs index 15c762eb5..3509e839d 100644 --- a/crates/ruborist/src/model/manifest.rs +++ b/crates/ruborist/src/model/manifest.rs @@ -163,20 +163,22 @@ pub async fn extract_core_version_off_runtime( full: Arc, version: String, ) -> (String, Option>) { - // See `parse_json_off_runtime` for the same rayon-vs-spawn_blocking - // history: rayon's `num_cpus` pool oversubscribes when many concurrent - // resolves all extract from full manifests at once. spawn_blocking's - // larger pool avoids the queue, and the work is genuinely blocking - // (lazy JSON re-parse via `get_core_version`) so the blocking pool - // is the right home. + // Round 3 attempted to switch this to `tokio::task::spawn_blocking` + // for the same reasons as `parse_json_off_runtime`, but CI showed + // it regressed p1 by 0.5s on `preload_wall`. Mechanism: this + // function is called per (name, spec), so packages with multiple + // specs (e.g. peer-dep range overlaps) call it 2-5x per fetch. + // spawn_blocking's per-dispatch overhead (channel + thread wake) + // is significant for short CPU work; with the multiplier this + // outweighed rayon queue waits at conc=64. Keep on rayon::spawn. 
#[cfg(not(target_arch = "wasm32"))] { - tokio::task::spawn_blocking(move || { + let (tx, rx) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { let core = full.get_core_version(&version).map(Arc::new); - (version, core) - }) - .await - .expect("spawn_blocking parse worker panicked") + let _ = tx.send((version, core)); + }); + rx.await.expect("rayon parse worker dropped before sending") } #[cfg(target_arch = "wasm32")] { diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 878b357a1..82703ed97 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -258,9 +258,12 @@ where .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + let t_serialize_start = std::time::Instant::now(); let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; // Export project cache from memory cache for the host to persist. + let t_cache_export_start = std::time::Instant::now(); let mut project_cache = ProjectCacheData::default(); for (key, manifest) in registry.cache().export_version_manifests() { // `parse_package_spec` rather than `split_once('@')` so scoped names @@ -271,6 +274,13 @@ where pkg_cache.specs.insert(spec.to_string(), version.clone()); pkg_cache.manifests.insert(version, (*manifest).clone()); } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); Ok(BuildDepsOutput { lock: PackageLock::new(&pkg.name, &pkg.version, packages), From 58d49aafd2f886d1af364d91f85997e4dc01e37e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:02:11 +0800 Subject: [PATCH 08/32] instrument(ruborist): preload main loop dispatch + result split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 4 measured serialize_us = 
15ms and cache_export_us = 34ms — both tiny — confirming the 1s gap
from manifest-bench (utoo p1 = 3.16s vs mb wall = 2.10s) is not in
post-build code.

Per-fetch math also pointed at main-loop bookkeeping:

  manifest-bench: eff_parallel = 52 (sum_work 111s / wall 2.14s)
  utoo preload  : eff_parallel = 43 (sum_work 120s / wall 2.85s)

Same conc=64 cap, but utoo loses 9 effective slots — most likely the
main loop's serial bookkeeping (dedup hash insert, format! key,
extract_transitive_deps, queue push, 3-4 receiver events) holds the
flow between futures.next() returning and the next fetch dispatch.

This commit splits the main loop into two timed segments:

preload_loop_dispatch_us: time spent in the `while in_flight <
concurrency` block — popping pending, dedup check, futures.push.

preload_loop_result_us: time spent processing each completed
future — extract_transitive_deps, pending.extend, on_manifest.

If dispatch+result sum approaches preload_wall, the main loop is the
bottleneck and we need to either (a) split processing onto a
dedicated task, or (b) use unbounded futures with a downstream
consumer. If they're small, the gap is elsewhere (per-task overhead
in resolve_package's inflight gates).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/ruborist/src/resolver/preload.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/crates/ruborist/src/resolver/preload.rs b/crates/ruborist/src/resolver/preload.rs
index 1230c5bf6..e9a777407 100644
--- a/crates/ruborist/src/resolver/preload.rs
+++ b/crates/ruborist/src/resolver/preload.rs
@@ -99,8 +99,17 @@ where
     let mut in_flight = 0usize;
     let mut started = false;

+    // Main-loop overhead instrumentation. Plain local `u64` accumulators
+    // (the loop is single-task, no atomics needed) so we
+    // can attribute the gap between manifest-bench's pure-HTTP wall
+    // and ruborist's preload wall: how much of the gap is bookkeeping
+    // (dedup hash, extract_transitive_deps, queue push, events) vs
+    // actual fetch wait?
+ let mut total_dispatch_us: u64 = 0; + let mut total_result_us: u64 = 0; + loop { // Fill up to concurrency limit + let dispatch_start = tokio::time::Instant::now(); while in_flight < concurrency { let item = loop { let Some((name, spec)) = pending.pop_front() else { @@ -134,6 +143,7 @@ where }); in_flight += 1; } + total_dispatch_us += dispatch_start.elapsed().as_micros() as u64; if in_flight == 0 { break; @@ -142,6 +152,7 @@ where let Some((name, result, elapsed_ms)) = futures.next().await else { break; }; + let result_start = tokio::time::Instant::now(); in_flight -= 1; if stats.success_count == 0 && stats.failed_count == 0 { @@ -174,8 +185,15 @@ where tracing::debug!("Failed to preload {}: {}", name, e); } } + total_result_us += result_start.elapsed().as_micros() as u64; } + tracing::info!( + "p1-breakdown preload_loop_dispatch_us={} preload_loop_result_us={}", + total_dispatch_us, + total_result_us, + ); + stats.total_processed = processed.len(); receiver.on_event(BuildEvent::PreloadComplete { From 8114bf42af0e9d102bd9c2893acd764d9e0470be Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 01:25:54 +0800 Subject: [PATCH 09/32] perf(pm): grow rayon pool to max(num_cpus, 8) to drain p1 extract queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 main-loop instrumentation showed the preload main loop itself is fast (15-25ms total dispatch+result). The 0.8s gap from manifest-bench's 2.10s wall lives INSIDE the spawned fetch tasks. Per-fetch wall (warm runs): measured: avg_request 30ms + avg_body 6ms + avg_parse 2.5ms = ~38ms derived: preload_wall 2.4s × eff_parallel(43) / 2730 = 38ms delta: ~12ms unaccounted per task That 12ms is `extract_core_version_off_runtime` queueing on rayon's 2-thread pool. extract is called per (name, spec) — for ant-design that's ~3000+ calls. 
With pool=2 and 64 concurrent fetches each dispatching extract, the
queue depth grows; each task waits its turn before extract returns.

Bump rayon pool to `max(num_cpus, 8)` on all platforms (Windows
additionally keeps its 8MB stack-size override). Sizing the
pool above the CPU count for short blocking JSON ops (parse +
extract) replaces FIFO queueing with parallel dispatch. Real CPU
contention is bounded by num_cpus (the kernel scheduler still gates),
so the extra pool threads just hold ready-to-run dispatches in
parallel rather than serialised in a queue.

Why not just spawn_blocking (round 3 attempt): tokio's blocking pool
defaults to 512 threads, but its per-dispatch overhead was higher
than rayon's even when queueing — round 3 regressed by 0.5s.

Expected: extract queue wait drops from ~12ms to ~1-2ms wall, p1
preload_wall narrows toward manifest-bench's 2.10s.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/pm/src/util/sysconf.rs | 45 ++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/crates/pm/src/util/sysconf.rs b/crates/pm/src/util/sysconf.rs
index af77a7745..645b7b451 100644
--- a/crates/pm/src/util/sysconf.rs
+++ b/crates/pm/src/util/sysconf.rs
@@ -6,13 +6,46 @@ pub fn init() {
         reset_sigpipe();
     }

-    // Windows default thread stack is 1MB, insufficient for libdeflater + tar
-    // + rayon work-stealing.
+    init_rayon_pool();
+}
+
+/// Configure the global rayon pool size.
+///
+/// Rayon defaults to `num_cpus` workers, which is 2 on GHA ubuntu-latest.
+/// Two workers are enough for the install-path's `par_chunks(64)` extract
+/// (mostly disk-bound), but the resolve-path's manifest parse + extract
+/// pipeline runs *many* short CPU bursts (parse: ~5ms, get_core_version:
+/// ~1-3ms) dispatched from up to 64 concurrent fetches.
+///
+/// With pool=2, each fetch waits up to ~25ms in queue per dispatch —
+/// fetch-breakdown instrumentation showed avg_parse jumping 5ms (CPU)
+/// → 30ms (CPU + queue) just from the first dispatch.
The second hop
+/// (`extract_core_version_off_runtime`) has the same problem. `tokio
+/// spawn_blocking` avoids the queue but its per-dispatch overhead
+/// (round 3 measurement) was higher than rayon's queue wait at 64×.
+///
+/// Sizing the pool above the host CPU count for these short, blocking
+/// JSON-shape operations gives the queue a chance to drain even when
+/// 64 fetches dispatch concurrently. The work itself is bounded — at
+/// most 2 are doing real CPU at once on a 2-core box; the extra pool
+/// slots just hold pending tasks until a CPU is free, replacing FIFO
+/// queueing with parallel dispatch.
+///
+/// Note `.max(8)` is a floor, not a cap: bigger machines keep their
+/// native `num_cpus` (already enough there); the floor of 8
+/// oversubscribes only on hosts with fewer than 8 cores, such as
+/// the constrained 2-core CI image.
+fn init_rayon_pool() {
+    let parallelism = std::thread::available_parallelism()
+        .map(std::num::NonZero::get)
+        .unwrap_or(2);
+    let threads = parallelism.max(8);
+
+    let builder = rayon::ThreadPoolBuilder::new().num_threads(threads);
+
     #[cfg(target_os = "windows")]
-    rayon::ThreadPoolBuilder::new()
-        .stack_size(8 * 1024 * 1024)
-        .build_global()
-        .ok();
+    let builder = builder.stack_size(8 * 1024 * 1024);
+
+    builder.build_global().ok();
 }

 /// Restore default SIGPIPE handling so broken pipes cause a clean exit

From 394f6c92d7c5f929c18846abec54fefb9dbbb1bd Mon Sep 17 00:00:00 2001
From: elrrrrrrr
Date: Sat, 9 May 2026 04:06:09 +0800
Subject: [PATCH 10/32] perf(pm): skip preload for p1 path; BFS does per-level
 parallel prefetch

Adds `BuildDepsOptions::skip_preload` so callers without a pipeline
consumer (utoo deps / package-lock-only) can drop the up-front
preload phase entirely. BFS now batches prefetch per level across
the whole frontier, then runs the existing sequential
process_dependency walk against the warmed cache.

For install paths (Context::pipeline_deps_options), skip_preload
stays false so PackageResolved events still feed the download/clone
pipeline.
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 8 ++- crates/ruborist/src/resolver/builder.rs | 71 +++++++++++++++++++++--- crates/ruborist/src/service/api.rs | 21 ++++++- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index b47def019..bc4d7faa1 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -63,6 +63,7 @@ impl Context { receiver, supports_semver: get_supports_semver(), catalogs, + skip_preload: false, } } @@ -82,8 +83,13 @@ impl Context { /// Resolve dependency tree with plain ProgressReceiver. Returns /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. + /// + /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes + /// `PackageResolved` events here, so preload is pure overhead — BFS's + /// own per-level parallel prefetch warms the manifest cache. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { - let options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; + options.skip_preload = true; let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 166372c91..d811fc38c 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -18,21 +18,22 @@ //! This separation allows for maximum parallelism during network I/O //! while keeping the graph building logic simple and deterministic. 
-use petgraph::graph::NodeIndex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; +use futures::stream::{self, StreamExt}; +use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_registry_dep}; -use crate::spec::{Catalogs, PackageSpec, Protocol}; +use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,9 +182,6 @@ struct NodeFlags { /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { - use crate::spec::SpecStr; - use std::collections::HashSet; - let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,20 +803,74 @@ async fn run_preload_phase( } /// Run the BFS traversal phase to build the dependency tree. +/// +/// Each level does a parallel prefetch of all unresolved registry specs +/// before the sequential `process_dependency` walk. The prefetch warms +/// the registry's manifest cache so the per-edge `process_dependency` +/// calls below hit cache instead of awaiting network. +/// +/// This collapses the previously-separate `run_preload_phase` (which +/// fetched all transitive manifests up-front) into per-level batches. 
+/// Net effect on `utoo deps`: no separate preload wall — fetch happens +/// inside BFS in waves matching the dep tree's natural levels. For +/// install paths (p0/p3), `run_preload_phase` may still run via +/// `skip_preload=false` and feed the `PackageResolved` pipeline event. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { + // Reset fetch counters so the breakdown line reports fetches issued + // *during* this BFS phase, not preload's. (Preload still runs for + // install-path callers and reports its own breakdown.) + if config.skip_preload { + crate::util::FETCH_TIMINGS.reset(); + } + let start = tokio::time::Instant::now(); + let mut total_prefetch_wall_us: u64 = 0; + let mut total_merge_wall_us: u64 = 0; let mut current_level = vec![graph.root_index]; + let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); + + // Phase A: collect unresolved registry edges across the whole level + // (deduplicated against earlier levels — once a (name, spec) is + // prefetched, the registry's cache satisfies every subsequent + // `process_dependency` call). + let mut prefetch_targets: Vec<(String, String)> = Vec::new(); + for &node_index in ¤t_level { + for edge in collect_unresolved_edges(graph, node_index) { + if edge.spec.is_registry_spec() { + let key = format!("{}@{}", edge.name, edge.spec); + if prefetched.insert(key) { + prefetch_targets.push((edge.name, edge.spec)); + } + } + } + } + + // Phase B: parallel prefetch — pure cache warming. Errors are + // ignored here; the sequential `process_dependency` below will + // re-issue (now hitting either cache or the same fresh failure) + // and propagate any real error through the existing path. 
+ if !prefetch_targets.is_empty() { + let prefetch_start = tokio::time::Instant::now(); + stream::iter(prefetch_targets) + .for_each_concurrent(config.concurrency, |(name, spec)| async move { + let _ = resolve_package(registry, &name, &spec).await; + }) + .await; + total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; + } + + let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -900,14 +952,17 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); + total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", bfs_elapsed.as_millis(), + total_prefetch_wall_us, + total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 82703ed97..5a14f2a56 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -70,6 +70,16 @@ pub struct BuildDepsOptions { /// Catalog definitions for the `catalog:` dependency protocol. /// Key `""` = default catalog, other keys = named catalogs. pub catalogs: Catalogs, + /// When true, skip the up-front `run_preload_phase`. Set by callers + /// that don't consume the `BuildEvent::PackageResolved` pipeline + /// stream — e.g. `utoo deps` (lockfile-only). The BFS phase has its + /// own per-level prefetch that warms the manifest cache, so dropping + /// preload doesn't change correctness, only avoids the redundant + /// up-front fetch + dedicated wall. 
+ /// Install paths (which feed `PipelineReceiver` to start tarball + /// downloads as resolves complete) leave this false so preload still + /// emits PackageResolved events to the pipeline. + pub skip_preload: bool, } impl BuildDepsOptions { @@ -91,6 +101,7 @@ impl BuildDepsOptions { receiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, } } } @@ -132,6 +143,7 @@ where receiver, supports_semver, catalogs, + skip_preload: skip_preload_caller, } = options; // 1. Find root path (workspace root if applicable) @@ -234,7 +246,13 @@ where registry.supports_semver(), ); - let skip_preload = cache_count > 0; + // Skip preload when: + // - the caller asked us to (e.g. `utoo deps`, no pipeline consumer + // for PackageResolved events — BFS does its own per-level + // prefetch, preload is redundant), OR + // - the project's warm cache already has manifests covering most + // of the workload (existing skip-on-warm behavior). + let skip_preload = skip_preload_caller || cache_count > 0; let mut config = BuildDepsConfig::default() .with_peer_deps(peer_deps) .with_concurrency(concurrency) @@ -334,6 +352,7 @@ mod tests { receiver: NoopReceiver, supports_semver: None, catalogs: HashMap::new(), + skip_preload: false, }; assert_eq!(options.concurrency, 20); From 596cd2045fd6ef5031703343b52ccad2a67a907f Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:18:21 +0800 Subject: [PATCH 11/32] perf(pm): fast_preload bypasses UnifiedRegistry for utoo deps path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::fast_preload, a manifest-bench-style flat FuturesUnordered over service::manifest::fetch_full_manifest. It warms MemoryCache (both full_manifests and version_manifests slots) synchronously after each fetch, so the BFS phase is pure cache-hit: no rayon hop on extract_core_version, no OnceMap gates, no DiskManifestStore writes, no PackageResolved events. 
Wired into service::api::build_deps: when the caller asks to skip preload (Context::build_deps for `utoo deps`) and there's no warm project cache, fast_preload runs ahead of build_deps_with_config. Install paths still go through preload_manifests so the pipeline keeps its early-start signal. Also reverts the per-level prefetch I added in 394f6c92 — with fast_preload pre-warming everything, BFS doesn't need its own prefetch wave. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 72 ++---- crates/ruborist/src/resolver/fast_preload.rs | 234 +++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 26 +++ 4 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 crates/ruborist/src/resolver/fast_preload.rs diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index d811fc38c..156622502 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -24,7 +24,6 @@ use std::sync::Arc; #[cfg(feature = "http-tarball")] use anyhow::Context as _; -use futures::stream::{self, StreamExt}; use petgraph::graph::NodeIndex; use crate::model::graph::{DependencyGraph, FindResult, PackageNode}; @@ -32,7 +31,7 @@ use crate::model::manifest::NodeManifest; use crate::model::node::EdgeType; use crate::model::package_json::PackageJson; use crate::resolver::preload::{PreloadConfig, preload_manifests}; -use crate::resolver::registry::{ResolveError, resolve_package, resolve_registry_dep}; +use crate::resolver::registry::{ResolveError, resolve_registry_dep}; use crate::spec::{Catalogs, PackageSpec, Protocol, SpecStr}; use crate::traits::progress::{BuildEvent, EventReceiver, NoopReceiver}; use crate::traits::registry::{RegistryClient, ResolvedPackage}; @@ -181,7 +180,10 @@ struct NodeFlags { /// Only registry specs (e.g. `^4.17.0`) are collected. 
`catalog:` specs are /// resolved at edge creation time, so by the time this runs they are already /// concrete registry specs. -fn gather_preload_deps(graph: &DependencyGraph, peer_deps: PeerDeps) -> Vec<(String, String)> { +pub(crate) fn gather_preload_deps( + graph: &DependencyGraph, + peer_deps: PeerDeps, +) -> Vec<(String, String)> { let mut deps = HashSet::new(); let collect = |node_index: NodeIndex, deps: &mut HashSet<(String, String)>| { @@ -805,72 +807,29 @@ async fn run_preload_phase( /// Run the BFS traversal phase to build the dependency tree. /// /// Each level does a parallel prefetch of all unresolved registry specs -/// before the sequential `process_dependency` walk. The prefetch warms -/// the registry's manifest cache so the per-edge `process_dependency` -/// calls below hit cache instead of awaiting network. +/// before the sequential `process_dependency` walk. /// -/// This collapses the previously-separate `run_preload_phase` (which -/// fetched all transitive manifests up-front) into per-level batches. -/// Net effect on `utoo deps`: no separate preload wall — fetch happens -/// inside BFS in waves matching the dep tree's natural levels. For -/// install paths (p0/p3), `run_preload_phase` may still run via -/// `skip_preload=false` and feed the `PackageResolved` pipeline event. +/// When `skip_preload=true` (lockfile-only path), the caller is +/// expected to have already populated `registry.cache()` via +/// [`super::fast_preload::fast_preload`], so this BFS sees only +/// cache hits. When `skip_preload=false` (install paths), the +/// receiver-driven [`super::preload::preload_manifests`] runs ahead +/// of this phase and feeds `BuildEvent::PackageResolved` to the +/// pipeline. async fn run_bfs_phase( graph: &mut DependencyGraph, registry: &R, config: &BuildDepsConfig, receiver: &E, ) -> Result<(), ResolveError> { - // Reset fetch counters so the breakdown line reports fetches issued - // *during* this BFS phase, not preload's. 
(Preload still runs for - // install-path callers and reports its own breakdown.) - if config.skip_preload { - crate::util::FETCH_TIMINGS.reset(); - } - let start = tokio::time::Instant::now(); - let mut total_prefetch_wall_us: u64 = 0; - let mut total_merge_wall_us: u64 = 0; - let mut current_level = vec![graph.root_index]; - let mut prefetched: HashSet = HashSet::new(); while !current_level.is_empty() { receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); - // Phase A: collect unresolved registry edges across the whole level - // (deduplicated against earlier levels — once a (name, spec) is - // prefetched, the registry's cache satisfies every subsequent - // `process_dependency` call). - let mut prefetch_targets: Vec<(String, String)> = Vec::new(); - for &node_index in ¤t_level { - for edge in collect_unresolved_edges(graph, node_index) { - if edge.spec.is_registry_spec() { - let key = format!("{}@{}", edge.name, edge.spec); - if prefetched.insert(key) { - prefetch_targets.push((edge.name, edge.spec)); - } - } - } - } - - // Phase B: parallel prefetch — pure cache warming. Errors are - // ignored here; the sequential `process_dependency` below will - // re-issue (now hitting either cache or the same fresh failure) - // and propagate any real error through the existing path. 
- if !prefetch_targets.is_empty() { - let prefetch_start = tokio::time::Instant::now(); - stream::iter(prefetch_targets) - .for_each_concurrent(config.concurrency, |(name, spec)| async move { - let _ = resolve_package(registry, &name, &spec).await; - }) - .await; - total_prefetch_wall_us += prefetch_start.elapsed().as_micros() as u64; - } - - let merge_start = tokio::time::Instant::now(); let mut next_level = Vec::new(); for node_index in current_level { @@ -952,17 +911,14 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::LevelComplete { next_level_count: next_level.len(), }); - total_merge_wall_us += merge_start.elapsed().as_micros() as u64; current_level = next_level; } let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms bfs_prefetch_wall_us={} bfs_merge_wall_us={} | {}", + "p1-breakdown bfs_wall={}ms | {}", bfs_elapsed.as_millis(), - total_prefetch_wall_us, - total_merge_wall_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs new file mode 100644 index 000000000..975c18a81 --- /dev/null +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -0,0 +1,234 @@ +//! Lean parallel manifest fetcher modeled on `manifest-bench`. +//! +//! Bypasses [`crate::service::registry::UnifiedRegistry`] — and therefore +//! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, +//! and `EventReceiver` event dispatch — to drive a flat +//! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] +//! plus a synchronous transitive walk. The warm +//! [`crate::service::cache::MemoryCache`] it leaves behind makes the +//! subsequent BFS phase a pure cache-hit walk: no network, no rayon +//! re-parse hop on `extract_core_version`. +//! +//! Intended for the lockfile-only path (`utoo deps`) which has no +//! 
pipeline consumer for `BuildEvent::PackageResolved` — install paths +//! still go through [`super::preload::preload_manifests`] so the +//! pipeline keeps its early-start signal. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::CoreVersionManifest; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +/// Statistics from the lean fetch loop. Mirrors `PreloadStats` shape so +/// the bench-grep regex stays the same. +#[derive(Debug, Default)] +pub struct FastPreloadStats { + pub success_count: usize, + pub failed_count: usize, + pub fetched_names: usize, + pub min_request_ms: u64, + pub max_request_ms: u64, + pub total_request_ms: u64, +} + +/// Collect dependencies from any deps map, filtering out non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +/// Extract transitive dependencies from a resolved manifest. +/// devDependencies are omitted (only the root installs devDeps). +fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut deps = Vec::new(); + deps.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + deps.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + deps.extend(collect_deps(manifest.optional_dependencies.as_ref())); + deps +} + +/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. 
+/// +/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does +/// after a cache hit — pick a version, parse just that subset, populate +/// the per-version cache slot the BFS phase will read from. Skips the +/// rayon/`spawn_blocking` hop because the caller is already doing +/// CPU-bound bookkeeping between fetches. +fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { + let Some(full) = cache.get_full_manifest(name) else { + return Vec::new(); + }; + let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { + return extract_transitive_deps(&cached, peer_deps); + } + let Some(core) = full.get_core_version(&resolved_version) else { + return Vec::new(); + }; + let core_arc = Arc::new(core); + cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch of all transitively-reachable +/// registry manifests. Populates `cache` with both `full_manifests` and +/// `version_manifests` slots so the subsequent BFS does no network and no +/// re-parse. +/// +/// `initial_deps` should already be the union of root+workspace +/// registry edges, with non-registry specs filtered out. +pub async fn fast_preload( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> FastPreloadStats { + let mut stats = FastPreloadStats::default(); + let mut pending: VecDeque = VecDeque::from(initial_deps); + // Specs we've already enqueued (or settled). Prevents duplicate + // sync resolutions from re-walking the same transitive subtree. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + // Names whose full manifest is either cached or in flight. 
Spec-level + // dedup happens in `seen_specs` above; this set is the gate that + // prevents two concurrent fetches for the same package (sibling + // specs queue against the in-flight one rather than racing). + let mut fetched_names: HashSet = HashSet::new(); + // Specs that arrived while their package's full manifest was still + // in flight — we'll settle them once the fetch lands. + let mut deferred_specs: Vec<(String, String)> = Vec::new(); + let mut futs = FuturesUnordered::new(); + let concurrency = config.concurrency; + let peer_deps = config.peer_deps; + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen_specs.insert((name.clone(), spec.clone())) { + continue; + } + + // Full manifest already cached: skip the network round-trip, + // settle synchronously and queue this package's transitive + // deps. This is the hot path on the second-and-later spec + // for any popular package (lodash, semver, etc.). + if cache.get_full_manifest(&name).is_some() { + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + continue; + } + + // Fetch in flight for this name — defer settling this spec + // until the fetch lands. The deferred set is small (only + // sibling specs for in-flight names) so the linear scan is + // cheaper than another HashMap. 
+ if !fetched_names.insert(name.clone()) { + deferred_specs.push((name, spec)); + continue; + } + + let registry_url = registry_url.to_string(); + let n = name.clone(); + futs.push(async move { + let start = tokio::time::Instant::now(); + let result = fetch_full_manifest(FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }) + .await; + let elapsed_ms = start.elapsed().as_millis() as u64; + (name, spec, result, elapsed_ms) + }); + } + + if futs.is_empty() { + break; + } + + let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + break; + }; + + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; + + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + cache.set_full_manifest(name.clone(), Arc::new(manifest)); + + let new_deps = settle_spec(&name, &spec, cache, peer_deps); + pending.extend(new_deps); + + // Drain any sibling specs that arrived while this fetch + // was in flight. `extract_if`-style retain in place. + let mut i = 0; + while i < deferred_specs.len() { + if deferred_specs[i].0 == name { + let (n, s) = deferred_specs.swap_remove(i); + let new_deps = settle_spec(&n, &s, cache, peer_deps); + pending.extend(new_deps); + } else { + i += 1; + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is unreachable + // here in practice; treat it as a soft-failure to keep the + // path total. 
+ stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); + } + } + } + + let total = stats.success_count + stats.failed_count; + let avg_ms = if total > 0 { + stats.total_request_ms / total as u64 + } else { + 0 + }; + tracing::info!( + "p1-breakdown fast_preload n={} ok={} fail={} avg_req={}ms min={}ms max={}ms | {}", + total, + stats.success_count, + stats.failed_count, + avg_ms, + stats.min_request_ms, + stats.max_request_ms, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index 582e03b31..e7baad988 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -3,6 +3,7 @@ pub mod builder; pub mod common; pub mod edges; +pub mod fast_preload; #[cfg(feature = "native-git")] pub mod git; #[cfg(feature = "http-tarball")] diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 5a14f2a56..3b9b713ea 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,7 +36,10 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, + gather_preload_deps, }; +use crate::resolver::fast_preload::fast_preload; +use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; use crate::spec::Catalogs; @@ -269,6 +272,29 @@ where ); } + // Lockfile-only callers (`utoo deps`) skip the receiver-driven + // `run_preload_phase` because they have no pipeline consumer for + // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat + // `FuturesUnordered` over `fetch_full_manifest` that warms the + // `MemoryCache` so the BFS phase below is pure cache-hit. 
This is + // the manifest-bench-style path; the heavier `preload_manifests` + // path (with `OnceMap` gates + `EventReceiver` events) only runs + // for install paths that need the pipeline signal. + if skip_preload_caller && cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + fast_preload( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. From 2e74bba904e391931a71960464932334e0d46e94 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 04:51:27 +0800 Subject: [PATCH 12/32] perf(pm): dispatch fast_preload settle to rayon to free tokio runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 of fast_preload called settle_spec inline on the tokio worker — each settle ran simd_json::to_borrowed_value over the full manifest's raw bytes (5–10ms per spec) right on the runtime thread. CI showed it starved sibling fetches: avg_request rose +3ms, avg_parse jumped 5→11ms, p1_resolve regressed +1.0s vs the preload+BFS baseline (4.0s vs 3.0s). Fix: route every settle through extract_core_version_off_runtime (the same rayon::spawn helper the BFS path uses), and merge fetch and settle completions into a single FuturesUnordered so backpressure on either side throttles the other. Sibling specs that arrived during a fetch are now stashed by name (HashMap, not linear scan), then dispatched as their own settle futures when the fetch lands. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 248 ++++++++++++------- 1 file changed, 163 insertions(+), 85 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 975c18a81..faea79752 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a synchronous transitive walk. The warm +//! plus a rayon-dispatched per-spec settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -13,13 +13,28 @@ //! pipeline consumer for `BuildEvent::PackageResolved` — install paths //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. +//! +//! ## Why settle is dispatched off-runtime +//! +//! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a +//! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` +//! over the manifest's raw bytes. That parse is 5–10ms per spec on a +//! 100KB body. Calling it inline on the tokio runtime (the v1 of this +//! module) starves the runtime worker — sibling fetches in flight stop +//! draining their sockets while the worker is parsing, which CI showed +//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the +//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` +//! (the same path the `extract_core_version_off_runtime` helper takes) +//! keeps the runtime free to drive I/O. 
-use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use futures::future::BoxFuture; use futures::stream::{FuturesUnordered, StreamExt}; -use crate::model::manifest::CoreVersionManifest; +use crate::model::manifest::{CoreVersionManifest, FullManifest, extract_core_version_off_runtime}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; @@ -41,8 +56,32 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } +/// Output of one in-flight future. The main loop merges fetch and settle +/// completions through a single `FuturesUnordered` so backpressure on +/// either side throttles the other naturally. +/// +/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- +/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes +/// the variant large enough that clippy flags the size delta with +/// `Settled`. The cost is one heap allocation per fetched manifest; +/// trivial against the network round-trip we already paid. +#[allow(clippy::large_enum_variant)] +enum FastEvent { + Fetched { + name: String, + primary_spec: String, + result: anyhow::Result, + elapsed_ms: u64, + }, + Settled { + new_deps: Vec, + }, +} + +type FastFut = Pin + Send>>; + /// Collect dependencies from any deps map, filtering out non-registry specs. -fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -62,29 +101,41 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against the cached `FullManifest` synchronously. +/// Resolve `(name, spec)` against `full` off the tokio runtime. 
/// -/// Inlines the work that `UnifiedRegistry::resolve_via_full_manifest` does -/// after a cache hit — pick a version, parse just that subset, populate -/// the per-version cache slot the BFS phase will read from. Skips the -/// rayon/`spawn_blocking` hop because the caller is already doing -/// CPU-bound bookkeeping between fetches. -fn settle_spec(name: &str, spec: &str, cache: &MemoryCache, peer_deps: PeerDeps) -> Vec { - let Some(full) = cache.get_full_manifest(name) else { - return Vec::new(); - }; - let Ok(resolved_version) = resolve_target_version((&*full).into(), spec) else { - return Vec::new(); - }; - if let Some(cached) = cache.get_version_manifest(name, &resolved_version) { - return extract_transitive_deps(&cached, peer_deps); - } - let Some(core) = full.get_core_version(&resolved_version) else { - return Vec::new(); - }; - let core_arc = Arc::new(core); - cache.set_version_manifest(name.to_string(), resolved_version, Arc::clone(&core_arc)); - extract_transitive_deps(&core_arc, peer_deps) +/// Returns the freshly-extracted version manifest's transitive deps so +/// the caller can extend its pending queue. The heavy +/// `simd_json::to_borrowed_value` parse runs inside +/// `extract_core_version_off_runtime`, which dispatches to rayon — same +/// path the BFS phase uses for cold extracts. 
+fn settle_future( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> BoxFuture<'static, FastEvent> { + Box::pin(async move { + let resolved_version = match resolve_target_version((&*full).into(), &spec) { + Ok(v) => v, + Err(_) => return FastEvent::Settled { new_deps: vec![] }, + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + return FastEvent::Settled { + new_deps: extract_transitive_deps(&cached, peer_deps), + }; + } + let (resolved_version, core) = + extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; + let new_deps = match core { + Some(core_arc) => { + cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); + extract_transitive_deps(&core_arc, peer_deps) + } + None => Vec::new(), + }; + FastEvent::Settled { new_deps } + }) } /// Manifest-bench-style flat parallel fetch of all transitively-reachable @@ -103,17 +154,15 @@ pub async fn fast_preload( let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); // Specs we've already enqueued (or settled). Prevents duplicate - // sync resolutions from re-walking the same transitive subtree. + // settles from re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - // Names whose full manifest is either cached or in flight. Spec-level - // dedup happens in `seen_specs` above; this set is the gate that - // prevents two concurrent fetches for the same package (sibling - // specs queue against the in-flight one rather than racing). + // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); - // Specs that arrived while their package's full manifest was still - // in flight — we'll settle them once the fetch lands. 
- let mut deferred_specs: Vec<(String, String)> = Vec::new(); - let mut futs = FuturesUnordered::new(); + // Sibling specs that arrived while their package's full manifest + // was still in flight. The fetch's completion handler drains this + // bucket — we stash by name so the lookup is one HashMap probe. + let mut deferred_by_name: HashMap> = HashMap::new(); + let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; let peer_deps = config.peer_deps; @@ -126,28 +175,33 @@ pub async fn fast_preload( continue; } - // Full manifest already cached: skip the network round-trip, - // settle synchronously and queue this package's transitive - // deps. This is the hot path on the second-and-later spec - // for any popular package (lodash, semver, etc.). - if cache.get_full_manifest(&name).is_some() { - let new_deps = settle_spec(&name, &spec, cache, peer_deps); - pending.extend(new_deps); + // Hot path: the full manifest is already cached (a sibling + // spec for this name has already returned). Dispatch a + // settle so the parse work runs on rayon, not on the tokio + // worker — keeps the runtime free for ongoing fetches. + if let Some(full) = cache.get_full_manifest(&name) { + futs.push(Box::pin(settle_future( + name, + spec, + full, + cache.clone(), + peer_deps, + ))); continue; } - // Fetch in flight for this name — defer settling this spec - // until the fetch lands. The deferred set is small (only - // sibling specs for in-flight names) so the linear scan is - // cheaper than another HashMap. + // A fetch for this name is already in flight: stash this + // spec; the fetch's completion handler will dispatch its + // settle. 
if !fetched_names.insert(name.clone()) { - deferred_specs.push((name, spec)); + deferred_by_name.entry(name).or_default().push(spec); continue; } let registry_url = registry_url.to_string(); + let primary_spec = spec.clone(); let n = name.clone(); - futs.push(async move { + futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); let result = fetch_full_manifest(FetchManifestOptions { registry_url: ®istry_url, @@ -157,58 +211,82 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - (name, spec, result, elapsed_ms) - }); + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } + })); } if futs.is_empty() { break; } - let Some((name, spec, result, elapsed_ms)) = futs.next().await else { + let Some(event) = futs.next().await else { break; }; - if stats.success_count == 0 && stats.failed_count == 0 { - stats.min_request_ms = elapsed_ms; - stats.max_request_ms = elapsed_ms; - } else { - stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); - stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); - } - stats.total_request_ms += elapsed_ms; + match event { + FastEvent::Fetched { + name, + primary_spec, + result, + elapsed_ms, + } => { + if stats.success_count == 0 && stats.failed_count == 0 { + stats.min_request_ms = elapsed_ms; + stats.max_request_ms = elapsed_ms; + } else { + stats.min_request_ms = stats.min_request_ms.min(elapsed_ms); + stats.max_request_ms = stats.max_request_ms.max(elapsed_ms); + } + stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - stats.success_count += 1; - stats.fetched_names += 1; - cache.set_full_manifest(name.clone(), Arc::new(manifest)); + match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + stats.success_count += 1; + stats.fetched_names += 1; + let full_arc = Arc::new(manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - let new_deps = settle_spec(&name, 
&spec, cache, peer_deps); - pending.extend(new_deps); + // Primary settle. + futs.push(Box::pin(settle_future( + name.clone(), + primary_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); - // Drain any sibling specs that arrived while this fetch - // was in flight. `extract_if`-style retain in place. - let mut i = 0; - while i < deferred_specs.len() { - if deferred_specs[i].0 == name { - let (n, s) = deferred_specs.swap_remove(i); - let new_deps = settle_spec(&n, &s, cache, peer_deps); - pending.extend(new_deps); - } else { - i += 1; + // Sibling settles that were stashed while the + // fetch was in flight. + if let Some(siblings) = deferred_by_name.remove(&name) { + for sibling_spec in siblings { + futs.push(Box::pin(settle_future( + name.clone(), + sibling_spec, + Arc::clone(&full_arc), + cache.clone(), + peer_deps, + ))); + } + } + } + Ok(FetchManifestResult::NotModified) => { + // No ETag was sent on these requests, so 304 is + // unreachable in practice; treat as soft failure. + stats.failed_count += 1; + } + Err(e) => { + stats.failed_count += 1; + tracing::debug!("fast_preload failed for {}: {}", name, e); } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is unreachable - // here in practice; treat it as a soft-failure to keep the - // path total. 
- stats.failed_count += 1; - } - Err(e) => { - stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); + FastEvent::Settled { new_deps } => { + pending.extend(new_deps); } } } From 04c9ec34d26fdb97f83014c9a09e241cd64715aa Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:19:48 +0800 Subject: [PATCH 13/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2064=20=E2=86=92=2096=20(manifest-bench=20best)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone manifest-bench HTTP-only sweep (npmjs, h1) shows wall bottoming at concurrency=96 (1817ms) — earlier 256 regression was caused by rayon-queued parses behind 2 workers, no longer relevant since fetch parse is on spawn_blocking and settle is rayon-dispatched off the runtime. fast_preload's wave-shaped transitive walk currently runs at eff_parallel ~35 against the 64 cap because pending refills lag settles; raising the cap to 96 gives headroom for sustained in-flight on the deep waves without crossing the npmjs per-IP tail-latency cliff that conc 128+ trips. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index a0235830a..f05b0f52f 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,15 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// 256 concurrency. Mechanism: parse_json_off_runtime dispatches to -// rayon, which has only num_cpus (=2 on GHA) workers. Bumping -// concurrency to 256 queued 256 parses behind 2 workers → wall -// per-parse jumped from 730µs to 266ms. 
Net p1 wall *increased* -// 3.10s → 3.33s on phases bench. Keep 64 until we address the -// parse-side queueing (e.g. inline parse on tokio, or a wider -// dedicated parse pool). +// Once we moved fetch parse off rayon to tokio's spawn_blocking pool +// (cap 512) and settle off the runtime via rayon::spawn, the original +// 256-concurrency regression mechanism (parses queued behind 2 rayon +// workers) no longer applies. The standalone manifest-bench HTTP-only +// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 +// (1817ms) and tracking flat-then-rising past that — beyond ~96 +// in-flight, npmjs's per-IP rate degrades and tail latency widens. +// 96 is the sweet spot: enough headroom for the wave-shaped transitive +// dep walk in fast_preload to keep the runtime busy, without paying the +// p99 widening that 128+ shows. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 64)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 6455852e518b3cc9859e12442972f40697360d73 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 05:46:06 +0800 Subject: [PATCH 14/32] perf(pm): fast_preload populates (name, spec) cache slot for BFS fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedRegistry::resolve_version_manifest`'s first cache check (service/registry.rs:347) keys on `(name, spec)` — the original spec string the caller passed, e.g. `^4.0.0`. settle_future was only populating `(name, resolved_version)` (e.g. `4.17.21`), so on every BFS edge for `lodash@^4.0.0`-style specs the warm path missed and fell into the OnceMap inflight gate + `resolve_via_full_manifest` re-walk before recovering the manifest from the `(name, resolved_version)` slot we'd already set. 
Now settle writes both keys so BFS hits the early-return at service/registry.rs:347 with no further dispatch. Saves ~1 OnceMap+resolve_target_version round-trip per unique (name, spec) the BFS encounters (≈3000 calls on ant-design-x). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index faea79752..c3845a73a 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -121,6 +121,8 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { + // Populate the (name, spec) slot too — see comment below. + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), }; @@ -129,6 +131,18 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { + // Populate BOTH cache slots so the subsequent BFS hits the + // fast path on its first call: + // * `(name, resolved_version)` — what + // `resolve_via_full_manifest` writes in the cold path, + // and what `extract_core_version_off_runtime`'s callers + // elsewhere expect. + // * `(name, spec)` — what `resolve_version_manifest`'s + // first cache check uses (line 347 in service/registry.rs). + // Without this slot, BFS still pays one OnceMap dispatch + // + `resolve_via_full_manifest` walk per `(name, spec)`, + // even though we've already done that work here. 
+ cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) } From 4bbcae8083de94ea69b6ef19611cdb59c719ca9c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:12:08 +0800 Subject: [PATCH 15/32] perf(pm): fuse primary settle into fetch task to drop dispatch RTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous fast_preload (v2) dispatched primary settles to rayon as separate FuturesUnordered futures. CI breakdown showed eff_parallel ~44 against the conc=96 cap — the wave-shaped transitive walk was held back by settle dispatch RTT: each fetch landed → primary settle queued → settle popped → only then did `pending` get transitive deps and fill the next dispatch wave. v3 folds the primary settle into the fetch task itself via `tokio::task::spawn_blocking`. The fetch task does the network round-trip and the primary version-extract on the same blocking pool slot, then returns with the resolved CoreVersionManifest attached. Main loop pulls one Fetched event, immediately extends `pending`, no second `next().await` to wait through the queue. Sibling specs (rare; same name, different range) still go through the rayon settle_future path so the primary path stays lean. Carries primary_spec through FastEvent so the fused path can populate both `(name, primary_spec)` and `(name, resolved_version)` cache slots — preserves the 6455852e BFS fast-path win. FetchOutcome enum replaces by-value FetchManifestResult to avoid a full FullManifest clone (HashMap+Vec) per fetch event. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 206 ++++++++++++------- 1 file changed, 135 insertions(+), 71 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index c3845a73a..008030139 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -4,7 +4,7 @@ //! its `OnceMap` gates, [`crate::service::store::ManifestStore`] writes, //! and `EventReceiver` event dispatch — to drive a flat //! `FuturesUnordered` over [`crate::service::manifest::fetch_full_manifest`] -//! plus a rayon-dispatched per-spec settle. The warm +//! plus a fused-into-fetch primary settle. The warm //! [`crate::service::cache::MemoryCache`] it leaves behind makes the //! subsequent BFS phase a pure cache-hit walk: no network, no rayon //! re-parse hop on `extract_core_version`. @@ -14,18 +14,30 @@ //! still go through [`super::preload::preload_manifests`] so the //! pipeline keeps its early-start signal. //! -//! ## Why settle is dispatched off-runtime +//! ## Why settle is fused into the fetch task //! //! A "settle" turns a freshly-fetched `FullManifest` plus a spec into a //! `CoreVersionManifest` for one version, via `simd_json::to_borrowed_value` //! over the manifest's raw bytes. That parse is 5–10ms per spec on a -//! 100KB body. Calling it inline on the tokio runtime (the v1 of this -//! module) starves the runtime worker — sibling fetches in flight stop -//! draining their sockets while the worker is parsing, which CI showed -//! as `avg_request` rising +3ms and `avg_parse` jumping 5→11ms vs the -//! UnifiedRegistry baseline. Routing settle through `rayon::spawn` -//! (the same path the `extract_core_version_off_runtime` helper takes) -//! keeps the runtime free to drive I/O. +//! 100KB body. +//! +//! v1 ran settle inline on the tokio runtime worker — that starved +//! 
sibling fetches' I/O drive (CI showed `avg_request` +3ms, +//! `avg_parse` 5→11ms). v2 dispatched settle to rayon via a separate +//! `FuturesUnordered` future, which fixed the runtime starvation but +//! introduced a dispatch RTT: fetch lands → rayon settle queued → settle +//! pops → `pending` finally gets transitive deps. That round-trip held +//! the wave-shaped transitive walk back, capping `eff_parallel` at ~44 +//! against a 96 cap. +//! +//! v3 (this) folds the primary settle into the fetch task itself via +//! `tokio::task::spawn_blocking`. The fetch task awaits both the +//! network round-trip and the version-extract on the same blocking +//! pool slot, then returns with the resolved `CoreVersionManifest` +//! attached. The main loop pulls a single `Fetched` event and +//! immediately extends `pending` — no separate settle pop. Sibling +//! specs (rare; same package, different range) still go through a +//! `Settled` future to keep the primary path lean. use std::collections::{HashMap, HashSet, VecDeque}; use std::pin::Pin; @@ -56,21 +68,31 @@ pub struct FastPreloadStats { pub total_request_ms: u64, } -/// Output of one in-flight future. The main loop merges fetch and settle -/// completions through a single `FuturesUnordered` so backpressure on -/// either side throttles the other naturally. -/// -/// `Fetched` is boxed because `FetchManifestResult::Ok` carries a fully- -/// parsed `FullManifest` (`raw` bytes + parsed envelope), which makes -/// the variant large enough that clippy flags the size delta with -/// `Settled`. The cost is one heap allocation per fetched manifest; -/// trivial against the network round-trip we already paid. -#[allow(clippy::large_enum_variant)] +/// One fetch's primary settle outcome — the resolved version + parsed +/// `CoreVersionManifest` for the spec the fetch was originally issued +/// for. `None` means the spec didn't match any version (caller treats +/// as soft skip). 
+type PrimarySettle = Option<(String, Arc)>; + +/// Outcome of a fetch task. Owning `Arc` (rather than +/// `FetchManifestResult` by-value) means the fetch task can `Arc::clone` +/// once for the primary settle, then pass ownership along — no full +/// `FullManifest` clone (which would copy the 200-entry `time` +/// HashMap + the `versions` `Vec` per fetch). +enum FetchOutcome { + Ok(Arc), + NotModified, + Err, +} + +/// Output of one in-flight future. The main loop merges fetch and +/// sibling-settle completions through a single `FuturesUnordered`. enum FastEvent { Fetched { name: String, primary_spec: String, - result: anyhow::Result, + outcome: FetchOutcome, + primary_settle: PrimarySettle, elapsed_ms: u64, }, Settled { @@ -101,13 +123,9 @@ fn extract_transitive_deps(manifest: &CoreVersionManifest, peer_deps: PeerDeps) deps } -/// Resolve `(name, spec)` against `full` off the tokio runtime. -/// -/// Returns the freshly-extracted version manifest's transitive deps so -/// the caller can extend its pending queue. The heavy -/// `simd_json::to_borrowed_value` parse runs inside -/// `extract_core_version_off_runtime`, which dispatches to rayon — same -/// path the BFS phase uses for cold extracts. +/// Off-runtime settle for a `(name, spec)` whose `FullManifest` is +/// already cached. Used for sibling specs — multiple ranges on the +/// same package — that arrive after the primary fetch has landed. fn settle_future( name: String, spec: String, @@ -121,7 +139,6 @@ fn settle_future( Err(_) => return FastEvent::Settled { new_deps: vec![] }, }; if let Some(cached) = cache.get_version_manifest(&name, &resolved_version) { - // Populate the (name, spec) slot too — see comment below. 
cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); return FastEvent::Settled { new_deps: extract_transitive_deps(&cached, peer_deps), @@ -131,17 +148,6 @@ fn settle_future( extract_core_version_off_runtime(Arc::clone(&full), resolved_version).await; let new_deps = match core { Some(core_arc) => { - // Populate BOTH cache slots so the subsequent BFS hits the - // fast path on its first call: - // * `(name, resolved_version)` — what - // `resolve_via_full_manifest` writes in the cold path, - // and what `extract_core_version_off_runtime`'s callers - // elsewhere expect. - // * `(name, spec)` — what `resolve_version_manifest`'s - // first cache check uses (line 347 in service/registry.rs). - // Without this slot, BFS still pays one OnceMap dispatch - // + `resolve_via_full_manifest` walk per `(name, spec)`, - // even though we've already done that work here. cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name, resolved_version, Arc::clone(&core_arc)); extract_transitive_deps(&core_arc, peer_deps) @@ -152,6 +158,35 @@ fn settle_future( }) } +/// Resolve `(name, spec)` against `full` on tokio's blocking pool. +/// +/// Same shape as `extract_core_version_off_runtime` (which uses rayon), +/// but stays inside the fetch task so the result lands together with +/// the network round-trip — no separate `FuturesUnordered` pop, so +/// `pending` gets the transitive deps the moment the fetch event is +/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is +/// `max(num_cpus, 8)`. With many primary settles arriving in waves, +/// the wider blocking pool absorbs the burst better than rayon would. 
+async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + }) + .await + .ok() + .flatten() + } + #[cfg(target_arch = "wasm32")] + { + let resolved = resolve_target_version((&*full).into(), &spec).ok()?; + let core = full.get_core_version(&resolved)?; + Some((resolved, Arc::new(core))) + } +} + /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -167,14 +202,14 @@ pub async fn fast_preload( ) -> FastPreloadStats { let mut stats = FastPreloadStats::default(); let mut pending: VecDeque = VecDeque::from(initial_deps); - // Specs we've already enqueued (or settled). Prevents duplicate - // settles from re-walking the same transitive subtree. + // Specs we've already enqueued. Prevents duplicate settles from + // re-walking the same transitive subtree. let mut seen_specs: HashSet<(String, String)> = HashSet::new(); // Names whose full manifest is in flight or already cached. let mut fetched_names: HashSet = HashSet::new(); // Sibling specs that arrived while their package's full manifest - // was still in flight. The fetch's completion handler drains this - // bucket — we stash by name so the lookup is one HashMap probe. + // was still in flight. The fetch's completion handler dispatches + // settles for them, then drains this bucket. let mut deferred_by_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let concurrency = config.concurrency; @@ -189,10 +224,10 @@ pub async fn fast_preload( continue; } - // Hot path: the full manifest is already cached (a sibling - // spec for this name has already returned). 
Dispatch a - // settle so the parse work runs on rayon, not on the tokio - // worker — keeps the runtime free for ongoing fetches. + // Hot path: a sibling spec for this name has already + // returned, so the full manifest is cached. Settle on + // rayon (off-runtime) — keeps the primary fetch path + // (next branch) clean. if let Some(full) = cache.get_full_manifest(&name) { futs.push(Box::pin(settle_future( name, @@ -205,8 +240,8 @@ pub async fn fast_preload( } // A fetch for this name is already in flight: stash this - // spec; the fetch's completion handler will dispatch its - // settle. + // sibling spec; the fetch's completion handler will + // dispatch a settle for it. if !fetched_names.insert(name.clone()) { deferred_by_name.entry(name).or_default().push(spec); continue; @@ -225,10 +260,30 @@ pub async fn fast_preload( }) .await; let elapsed_ms = start.elapsed().as_millis() as u64; + // Fuse the primary settle into the same task so the + // main loop sees the resolved version + transitive + // deps in the same event — no extra `next().await` to + // wait through the FuturesUnordered queue before + // `pending` can refill. 
+ let (outcome, primary_settle) = match result { + Ok(FetchManifestResult::Ok(manifest, _etag)) => { + let full_arc = Arc::new(manifest); + let settle = + resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) + .await; + (FetchOutcome::Ok(full_arc), settle) + } + Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Err(e) => { + tracing::debug!("fast_preload failed for {}: {}", n, e); + (FetchOutcome::Err, None) + } + }; FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } })); @@ -246,7 +301,8 @@ pub async fn fast_preload( FastEvent::Fetched { name, primary_spec, - result, + outcome, + primary_settle, elapsed_ms, } => { if stats.success_count == 0 && stats.failed_count == 0 { @@ -258,24 +314,36 @@ pub async fn fast_preload( } stats.total_request_ms += elapsed_ms; - match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { + match outcome { + FetchOutcome::Ok(full_arc) => { stats.success_count += 1; stats.fetched_names += 1; - let full_arc = Arc::new(manifest); cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); - // Primary settle. - futs.push(Box::pin(settle_future( - name.clone(), - primary_spec, - Arc::clone(&full_arc), - cache.clone(), - peer_deps, - ))); + // Apply the primary settle (already done inside + // the fetch task via spawn_blocking) — populate + // both `(name, primary_spec)` and + // `(name, resolved_version)` cache slots so BFS + // hits the early-return at registry.rs:347 on + // its first probe, then extend `pending` with + // the spec's transitive deps. + if let Some((resolved_version, core_arc)) = primary_settle { + cache.set_version_manifest( + name.clone(), + primary_spec, + Arc::clone(&core_arc), + ); + cache.set_version_manifest( + name.clone(), + resolved_version, + Arc::clone(&core_arc), + ); + pending.extend(extract_transitive_deps(&core_arc, peer_deps)); + } - // Sibling settles that were stashed while the - // fetch was in flight. 
+ // Sibling specs that were stashed while the + // fetch was in flight: dispatch each as a + // separate settle future. if let Some(siblings) = deferred_by_name.remove(&name) { for sibling_spec in siblings { futs.push(Box::pin(settle_future( @@ -288,14 +356,10 @@ pub async fn fast_preload( } } } - Ok(FetchManifestResult::NotModified) => { - // No ETag was sent on these requests, so 304 is - // unreachable in practice; treat as soft failure. - stats.failed_count += 1; - } - Err(e) => { + FetchOutcome::NotModified | FetchOutcome::Err => { + // 304 is unreachable in practice (no ETag sent); + // both branches treated as soft failure. stats.failed_count += 1; - tracing::debug!("fast_preload failed for {}: {}", name, e); } } } From 671ac98e51e4a7ca4e53149c8bead24b4f144451 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 06:42:55 +0800 Subject: [PATCH 16/32] perf(pm): combined-parse fetch path eliminates per-fetch double simd_json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fast_preload hot path was paying TWO simd_json passes per manifest: 1. fetch_full_manifest's parse_json_off_runtime did a typed simd_json::serde::from_slice (envelope + IgnoredAny visitor on `versions` keys, ~3-5ms on a 100KB body). 2. Primary settle re-parsed the same raw bytes with simd_json::to_borrowed_value (~5-10ms) to extract one version's subtree. Both passes went through simd_json's Tape constructor — duplicated work. CI showed avg_parse 5-7ms × 2700 fetches = 14-19s of CPU sum on 2-core GHA, where the spawn_blocking pool's overlapping schedule masked some of the cost but not all. 
Adds `service::manifest::fetch_full_manifest_with_settle`: same HTTP + retry + ETag machinery as `fetch_full_manifest`, but the parse step does ONE `to_borrowed_value` and extracts: * envelope (`name`, `dist-tags`, `versions` keys) into FullManifest manually (no typed serde), and * the resolved version's subtree as a typed CoreVersionManifest (serde-deserializing that single subtree via the borrowed value). fast_preload's fetch task switches to this entry point — primary settle is now a free byproduct of the fetch parse, not a separate `to_borrowed_value` pass. Sibling specs (same name, different range) still go through the rayon settle_future path. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/fast_preload.rs | 68 ++---- crates/ruborist/src/service/manifest.rs | 208 +++++++++++++++++++ crates/ruborist/src/service/mod.rs | 5 +- 3 files changed, 231 insertions(+), 50 deletions(-) diff --git a/crates/ruborist/src/resolver/fast_preload.rs b/crates/ruborist/src/resolver/fast_preload.rs index 008030139..d049321d8 100644 --- a/crates/ruborist/src/resolver/fast_preload.rs +++ b/crates/ruborist/src/resolver/fast_preload.rs @@ -51,7 +51,8 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::{ - FetchManifestOptions, FetchManifestResult, MemoryCache, MetadataFormat, fetch_full_manifest, + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, }; use crate::spec::SpecStr; use crate::util::FETCH_TIMINGS; @@ -158,35 +159,6 @@ fn settle_future( }) } -/// Resolve `(name, spec)` against `full` on tokio's blocking pool. 
-/// -/// Same shape as `extract_core_version_off_runtime` (which uses rayon), -/// but stays inside the fetch task so the result lands together with -/// the network round-trip — no separate `FuturesUnordered` pop, so -/// `pending` gets the transitive deps the moment the fetch event is -/// drained. Tokio's blocking pool has a 512-thread cap; rayon's is -/// `max(num_cpus, 8)`. With many primary settles arriving in waves, -/// the wider blocking pool absorbs the burst better than rayon would. -async fn resolve_primary_settle(spec: String, full: Arc) -> PrimarySettle { - #[cfg(not(target_arch = "wasm32"))] - { - tokio::task::spawn_blocking(move || { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - }) - .await - .ok() - .flatten() - } - #[cfg(target_arch = "wasm32")] - { - let resolved = resolve_target_version((&*full).into(), &spec).ok()?; - let core = full.get_core_version(&resolved)?; - Some((resolved, Arc::new(core))) - } -} - /// Manifest-bench-style flat parallel fetch of all transitively-reachable /// registry manifests. Populates `cache` with both `full_manifests` and /// `version_manifests` slots so the subsequent BFS does no network and no @@ -252,28 +224,28 @@ pub async fn fast_preload( let n = name.clone(); futs.push(Box::pin(async move { let start = tokio::time::Instant::now(); - let result = fetch_full_manifest(FetchManifestOptions { - registry_url: ®istry_url, - name: &n, - format: MetadataFormat::Abbreviated, - etag: None, - }) + // Combined fetch + envelope parse + primary settle in + // a single `to_borrowed_value` pass — replaces the old + // pattern of typed-serde envelope parse followed by a + // separate `to_borrowed_value` reparse for version + // extraction. Halves simd_json work per fetch. 
+ let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &n, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &primary_spec, + ) .await; let elapsed_ms = start.elapsed().as_millis() as u64; - // Fuse the primary settle into the same task so the - // main loop sees the resolved version + transitive - // deps in the same event — no extra `next().await` to - // wait through the FuturesUnordered queue before - // `pending` can refill. let (outcome, primary_settle) = match result { - Ok(FetchManifestResult::Ok(manifest, _etag)) => { - let full_arc = Arc::new(manifest); - let settle = - resolve_primary_settle(primary_spec.clone(), Arc::clone(&full_arc)) - .await; - (FetchOutcome::Ok(full_arc), settle) + Ok(FetchWithSettleResult::Ok(payload)) => { + let full_arc = Arc::new(payload.manifest); + (FetchOutcome::Ok(full_arc), payload.primary_settle) } - Ok(FetchManifestResult::NotModified) => (FetchOutcome::NotModified, None), + Ok(FetchWithSettleResult::NotModified) => (FetchOutcome::NotModified, None), Err(e) => { tracing::debug!("fast_preload failed for {}: {}", n, e); (FetchOutcome::Err, None) diff --git a/crates/ruborist/src/service/manifest.rs b/crates/ruborist/src/service/manifest.rs index 90f1db71b..38db87969 100644 --- a/crates/ruborist/src/service/manifest.rs +++ b/crates/ruborist/src/service/manifest.rs @@ -4,7 +4,11 @@ //! [`crate::service::fetch`] so retry policy stays uniform across registry //! manifest fetches and non-registry resolvers (git, http tarball). +use std::collections::HashMap; +use std::sync::Arc; + use anyhow::{Result, anyhow}; +use serde::Deserialize; use tokio_retry::RetryIf; use super::fetch::{ @@ -12,6 +16,7 @@ use super::fetch::{ }; use super::http::get_client; use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::resolver::version::resolve_target_version; use crate::util::FETCH_TIMINGS; /// Parse JSON bytes on tokio's blocking thread pool. 
@@ -157,6 +162,209 @@ pub async fn fetch_full_manifest(opts: FetchManifestOptions<'_>) -> Result, + /// `Some` when the requested spec resolves to a real version in + /// `manifest.versions`. `None` only on no-match (rare; usually a + /// spec referring to a yanked or moved version). + pub primary_settle: Option, +} + +/// `(resolved_version, parsed_subtree)` — what +/// [`fetch_full_manifest_with_settle`] hands back to callers that +/// supplied a `primary_spec`. +pub type PrimarySettleResult = (String, Arc); + +#[allow(clippy::large_enum_variant)] +pub enum FetchWithSettleResult { + Ok(FetchWithSettle), + NotModified, +} + +/// Fetch a full manifest and resolve the primary spec from the same +/// parse pass. +/// +/// Where [`fetch_full_manifest`] uses `simd_json::serde::from_slice` +/// to materialize a typed `FullManifest` (cheap envelope, deep +/// `versions` subtrees skipped via `IgnoredAny`) and leaves version +/// subtree extraction to a later `simd_json::to_borrowed_value` +/// reparse, this entry point does the borrowed-value parse once and +/// extracts: +/// * envelope fields needed by the resolver (`name`, `dist-tags`, +/// `versions` keys), +/// * the resolved-version subtree as a typed +/// [`CoreVersionManifest`]. +/// +/// Saves one full simd_json pass on the parse hot path — +/// `fast_preload` uses ~2700 of these per `utoo deps` cold run, so +/// halving the per-fetch parse work meaningfully reduces CPU on +/// 2-core CI. 
+pub async fn fetch_full_manifest_with_settle( + opts: FetchManifestOptions<'_>, + primary_spec: &str, +) -> Result { + let url = format!("{}/{}", opts.registry_url, opts.name); + let etag_owned = opts.etag.map(|s| s.to_string()); + let primary_spec_owned = primary_spec.to_string(); + let accept = match opts.format { + MetadataFormat::Abbreviated => "application/vnd.npm.install-v1+json", + MetadataFormat::Complete => "application/json", + }; + + RetryIf::spawn( + retry_strategy(), + || { + let url = url.clone(); + let etag = etag_owned.clone(); + let primary_spec = primary_spec_owned.clone(); + async move { + let mut request = get_client() + .map_err(FetchError::Permanent)? + .get(&url) + .header("Accept", accept); + if let Some(etag_value) = &etag { + request = request.header("If-None-Match", etag_value); + } + + let t_request_start = std::time::Instant::now(); + let response = request.send().await.map_err(classify_reqwest_error)?; + let request_us = t_request_start.elapsed().as_micros() as u64; + let status = response.status(); + + if status == reqwest::StatusCode::NOT_MODIFIED { + if etag.is_some() { + return Ok(FetchWithSettleResult::NotModified); + } + return Err(classify_status(status, &url)); + } + + if status.is_success() { + let new_etag = response + .headers() + .get("etag") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let t_body_start = std::time::Instant::now(); + let raw_bytes = response + .bytes() + .await + .map_err(|e| FetchError::Permanent(anyhow!("Response read error: {e}")))? 
+ .to_vec(); + let body_us = t_body_start.elapsed().as_micros() as u64; + let bytes_len = raw_bytes.len() as u64; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes); + + let t_parse_start = std::time::Instant::now(); + let parse_result = + parse_envelope_and_settle(Arc::clone(&raw_arc), primary_spec) + .await + .map_err(FetchError::Permanent)?; + let parse_us = t_parse_start.elapsed().as_micros() as u64; + + FETCH_TIMINGS.record(request_us, body_us, parse_us, bytes_len); + + let (manifest, primary_settle) = parse_result; + Ok(FetchWithSettleResult::Ok(FetchWithSettle { + manifest, + etag: new_etag, + primary_settle, + })) + } else { + Err(classify_status(status, &url)) + } + } + }, + is_retryable, + ) + .await + .map_err(|e| match e { + FetchError::Retryable(e) | FetchError::Permanent(e) => { + anyhow!("Failed to fetch {}: {:#}", opts.name, e) + } + }) +} + +/// Off-runtime combined parse: `simd_json::to_borrowed_value` once, +/// extract envelope into [`FullManifest`] + resolve `primary_spec` +/// against the parsed `versions` keys + materialize the resolved +/// version's subtree into [`CoreVersionManifest`]. +/// +/// Constructs `FullManifest` manually rather than via typed serde so +/// the work is exactly one parse pass. Other `FullManifest` fields +/// (`description`, `time`, `maintainers`, etc.) stay at `Default` +/// values — none are read on the resolver hot path. +async fn parse_envelope_and_settle( + raw: Arc<[u8]>, + primary_spec: String, +) -> Result<(FullManifest, Option)> { + #[cfg(not(target_arch = "wasm32"))] + { + tokio::task::spawn_blocking(move || parse_envelope_and_settle_sync(raw, &primary_spec)) + .await + .map_err(|e| anyhow!("spawn_blocking parse panicked: {e}"))? 
+ } + #[cfg(target_arch = "wasm32")] + { + parse_envelope_and_settle_sync(raw, &primary_spec) + } +} + +fn parse_envelope_and_settle_sync( + raw: Arc<[u8]>, + primary_spec: &str, +) -> Result<(FullManifest, Option)> { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; + + let mut buf = (*raw).to_vec(); + let parsed = + simd_json::to_borrowed_value(&mut buf).map_err(|e| anyhow!("JSON parse error: {e}"))?; + + let name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_default(); + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); + + let manifest = FullManifest { + name, + dist_tags: dist_tags.clone(), + versions: versions_keys, + raw, + ..Default::default() + }; + + // Resolve spec against the just-extracted envelope. + let primary_settle = match resolve_target_version((&manifest).into(), primary_spec) { + Ok(resolved) => parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + .map(|core| (resolved, Arc::new(core))), + Err(_) => None, + }; + + Ok((manifest, primary_settle)) +} + /// Fetch full manifest without ETag / 304 support. 
/// /// Convenience wrapper around [`fetch_full_manifest`] for callers that never diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 13109e994..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -60,8 +60,9 @@ pub use cache::{ pub use fs::{Glob, NoopGlob, exists, read_to_string}; pub use http::client_builder; pub use manifest::{ - FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, MetadataFormat, - fetch_full_manifest, fetch_full_manifest_fresh, fetch_version_manifest, + FetchManifestOptions, FetchManifestResult, FetchVersionManifestOptions, FetchWithSettle, + FetchWithSettleResult, MetadataFormat, fetch_full_manifest, fetch_full_manifest_fresh, + fetch_full_manifest_with_settle, fetch_version_manifest, }; pub use registry::UnifiedRegistry; pub use store::{ManifestStore, NoopStore}; From 542d7f144ec700ab5601247eff655399585fedbe Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:11:45 +0800 Subject: [PATCH 17/32] =?UTF-8?q?perf(pm):=20bump=20manifests-concurrency-?= =?UTF-8?q?limit=2096=20=E2=86=92=20128?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After 671ac98e's combined-parse fetch path eliminated the double simd_json pass, the spawn_blocking pool's contention ceiling rose enough that bumping concurrency past 96 no longer queues parses behind 2-core CPU. manifest-bench's most recent good-network sweep on GHA showed conc=128 hitting 1500ms vs conc=96 at 1566ms — small but real headroom for fast_preload's late-wave saturation now that initial waves fill faster. Risk: on slower-network runs (npmjs per-IP throttle), conc=128 widens p99. Earlier conc-sweep data was mixed — accepting that variance for the average-case improvement. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index f05b0f52f..2f389379e 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,18 +137,17 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once we moved fetch parse off rayon to tokio's spawn_blocking pool -// (cap 512) and settle off the runtime via rayon::spawn, the original -// 256-concurrency regression mechanism (parses queued behind 2 rayon -// workers) no longer applies. The standalone manifest-bench HTTP-only -// sweep on GHA (npmjs, conc 32→256) shows wall bottoming out at conc 96 -// (1817ms) and tracking flat-then-rising past that — beyond ~96 -// in-flight, npmjs's per-IP rate degrades and tail latency widens. -// 96 is the sweet spot: enough headroom for the wave-shaped transitive -// dep walk in fast_preload to keep the runtime busy, without paying the -// p99 widening that 128+ shows. +// Once parse work shrank (combined `to_borrowed_value` pass replaces +// the typed-serde envelope parse + reparse), spawn_blocking pool +// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep +// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — +// in the most recent good-network run, conc=128 hit 1500ms vs +// conc=96 at 1566ms. Bumping to 128 narrows the gap between +// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 +// because pending takes ~2 wave depths to fill) and the cap, so +// the late-wave saturation has more headroom. 
static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From c8768ac4ce8ca26a60a3313e22dba7ac625665d7 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 07:39:24 +0800 Subject: [PATCH 18/32] revert(pm): manifests-concurrency-limit back to 96 542d7f14's conc=128 bench landed in a slow-network run (mb best 2010ms vs 1500ms in the prior good-network run; bun also bumped to 2.14s vs 1.83s). Adjusted gap to mb best stayed flat (~700ms either way), so conc=128 didn't beat 96 across runs. Picking 96 as the conservative default: at-or-near best on every GHA run we've measured, never the worst, and leaves headroom for npmjs's per-IP throttling to absorb without compounding p99. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/util/user_config.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/crates/pm/src/util/user_config.rs b/crates/pm/src/util/user_config.rs index 2f389379e..f6924f5aa 100644 --- a/crates/pm/src/util/user_config.rs +++ b/crates/pm/src/util/user_config.rs @@ -137,17 +137,18 @@ pub fn get_install_scope() -> InstallScope { // We tried 256 to match bun's observed parallel streams; on GHA the // fetch-breakdown instrumentation showed sum_parse exploded from // ~10ms (local Mac, network-bound) to 728s on first cold run with -// Once parse work shrank (combined `to_borrowed_value` pass replaces -// the typed-serde envelope parse + reparse), spawn_blocking pool -// pressure no longer caps us at 96. manifest-bench's HTTP-only sweep -// on GHA (npmjs, h1) consistently picks 96 or 128 as best wall — -// in the most recent good-network run, conc=128 hit 1500ms vs -// conc=96 at 1566ms. 
Bumping to 128 narrows the gap between -// fast_preload's wave-shaped concurrency floor (eff_parallel ~48 -// because pending takes ~2 wave depths to fill) and the cap, so -// the late-wave saturation has more headroom. +// manifest-bench's HTTP-only sweep on GHA (npmjs, h1) bottoms out +// somewhere in the 96-128 band — which one wins varies with npmjs's +// per-IP latency on each run (good runs picked 128, slow-network +// runs flattened the curve and even regressed at 128 due to wider +// p99 from queued requests). 96 is the conservative pick: it's at +// or near best on every run we've measured, never the worst, and +// leaves headroom for npmjs to throttle without compounding queue +// time. Combined-parse fetch (671ac98e) made the spawn_blocking +// pool no longer a contention bottleneck, but didn't change the +// network-side variance — that's what caps the useful concurrency. static MANIFESTS_CONCURRENCY_LIMIT: LazyLock> = - LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 128)); + LazyLock::new(|| ConfigValue::new("manifests-concurrency-limit", 96)); pub fn set_manifests_concurrency_limit(value: Option) { MANIFESTS_CONCURRENCY_LIMIT.set(value); From 3be7487d7ad772667ac125ce82955432c257f8d3 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 09:57:58 +0800 Subject: [PATCH 19/32] perf(pm): mb_resolve experimental fetch path (parallel track to fast_preload) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds resolver::mb_resolve module + service::build_deps_mb entry point as a parallel-track alternative to fast_preload, structured to match manifest-bench's main-loop shape as closely as correctness allows. Hypothesis under test: fast_preload's eff_parallel caps at ~50/96 because the FastEvent enum match + cache writes + sibling deferred bookkeeping in the main loop competes with tokio runtime workers for the 2 CPU cores on GHA, stalling socket I/O drive. 
mb_fetch pushes ALL per-fetch work into the spawned future itself (including cache writes), so the main loop is reduced to: while let Some(deps) = futs.next().await { pending.extend(deps); refill_to_cap(...); } Sibling specs (multiple ranges on same package) are NOT deferred at queue level — racing fetches for the same name both proceed. The race converges naturally: first fetch to land populates full_manifests, subsequent racers find the cache hit on entry and short-circuit to a sibling-style settle. Wastes ~5-50 network requests in real workloads but eliminates the HashMap probe + drain overhead from the hot loop. Wired in via UTOO_RESOLVE=mb env var: - Context::build_deps (utoo deps) routes through build_deps_mb - pipeline::resolve_with_pipeline (utoo install) also routes through it; pipeline workers still start but don't pipeline during fetch (mb_fetch emits no PackageResolved events) — install becomes phase-sequential, useful for resolve-phase A/B. bench script enables UTOO_RESOLVE=mb so CI measures the new path against existing baselines (utoo-next/utoo-npm/bun ignore the env var). Comment the export line to A/B back against fast_preload. Old fast_preload + UnifiedRegistry paths untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/pm-bench-phases.sh | 7 + crates/pm/src/helper/ruborist_context.rs | 12 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 243 +++++++++++++++++++++ crates/ruborist/src/resolver/mod.rs | 1 + crates/ruborist/src/service/api.rs | 161 ++++++++++++++ crates/ruborist/src/service/mod.rs | 2 +- 7 files changed, 440 insertions(+), 3 deletions(-) create mode 100644 crates/ruborist/src/resolver/mb_resolve.rs diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 226ffb751..26e43388c 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,6 +22,13 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" +# Route the current `utoo` binary's resolve phase through the +# experimental `mb_resolve` flat-fetch path. Other PMs ignore this +# env var (utoo-next is built from origin/next which doesn't have +# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out +# to A/B against the default `fast_preload` path. +export UTOO_RESOLVE=mb + # Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN # by the optional "Build next branch utoo" step. Local runs without them diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index bc4d7faa1..542664f8c 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -87,10 +87,20 @@ impl Context { /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes /// `PackageResolved` events here, so preload is pure overhead — BFS's /// own per-level parallel prefetch warms the manifest cache. 
+ /// + /// Set `UTOO_RESOLVE=mb` to opt into the experimental + /// manifest-bench-style fetch path (`build_deps_mb`) for A/B + /// benchmarking against the current `fast_preload`. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let output = utoo_ruborist::service::build_deps(options).await?; + let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 719d31d13..4169ca88d 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,7 +41,22 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - let output = utoo_ruborist::service::build_deps(options).await?; + // `UTOO_RESOLVE=mb` reroutes install through the experimental + // mb-style fetch path. Pipeline workers are still started, but + // because mb_fetch doesn't emit `PackageResolved` events, the + // pipeline only fires once BFS completes (graph_to_package_lock + // emits `PackagePlaced` from BFS). Install becomes + // phase-sequential — fetch all manifests, then download + + // clone. Useful for A/B benchmarking the resolve phase in + // isolation; the pipelining advantage of the default path is + // lost. 
+ let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); + let output = if use_mb { + tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); + utoo_ruborist::service::build_deps_mb(options).await? + } else { + utoo_ruborist::service::build_deps(options).await? + }; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs new file mode 100644 index 000000000..2928638be --- /dev/null +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -0,0 +1,243 @@ +//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! +//! A parallel-track alternative to [`super::fast_preload`], structured +//! to match `manifest-bench`'s main-loop shape as closely as +//! correctness allows. The hypothesis under test: `fast_preload`'s +//! eff_parallel caps at ~50 against a 96-cap because the main loop's +//! CPU work (FastEvent enum match + cache writes + sibling-deferred +//! bookkeeping + Box::pin allocation) competes with tokio runtime +//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! +//! `mb_resolve` pushes ALL per-fetch work into the spawned future +//! itself (cache writes included) so the main loop is reduced to: +//! +//! ```ignore +//! while let Some(deps) = futs.next().await { +//! pending.extend(deps); +//! refill_to_cap(&mut futs, &mut pending, ...); +//! } +//! ``` +//! +//! Sibling specs (multiple ranges on the same package) are NOT +//! deferred at queue level — if two specs for the same name race, +//! both fetch. This wastes a small number of network requests (~5-50 +//! across a real install) but keeps the main loop's per-event cost +//! minimal (no HashMap probe / drain). The race converges: whichever +//! fetch lands first populates `full_manifests`; subsequent racers +//! 
find the cache hit on entry and short-circuit to a sibling-style +//! settle without re-fetching. +//! +//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` +//! and `utoo install` route through this when set; install loses +//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but +//! gains the lean main loop for resolve-phase A/B testing. + +use std::collections::{HashSet, VecDeque}; +use std::sync::Arc; + +use futures::stream::{FuturesUnordered, StreamExt}; + +use crate::model::manifest::{CoreVersionManifest, FullManifest}; +use crate::model::node::PeerDeps; +use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::version::resolve_target_version; +use crate::service::{ + FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, + fetch_full_manifest_with_settle, +}; +use crate::spec::SpecStr; +use crate::util::FETCH_TIMINGS; + +#[derive(Debug, Default)] +pub struct MbFetchStats { + pub success: usize, + pub fail: usize, +} + +/// Collect dependencies from a deps map, filtering non-registry specs. +fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { + map.into_iter() + .flatten() + .filter(|(_, spec)| spec.is_registry_spec()) + .map(|(name, spec)| (name.clone(), spec.clone())) + .collect() +} + +fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Vec { + let mut out = Vec::new(); + out.extend(collect_deps(manifest.dependencies.as_ref())); + if peer_deps == PeerDeps::Include { + out.extend(collect_deps(manifest.peer_dependencies.as_ref())); + } + out.extend(collect_deps(manifest.optional_dependencies.as_ref())); + out +} + +/// Settle one (name, spec) against an already-cached `FullManifest`. +/// Used for sibling specs (or racing-fetch losers) — extracts the +/// resolved version's `CoreVersionManifest` on the blocking pool, +/// populates both `(name, spec)` and `(name, resolved_version)` cache +/// slots so BFS hits the early-return fast path. 
+async fn settle_sibling( + name: String, + spec: String, + full: Arc, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + return Vec::new(); + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name, spec, Arc::clone(&cached)); + return extract_transitive(&cached, peer_deps); + } + + let resolved_for_parse = resolved.clone(); + let full_for_parse = Arc::clone(&full); + let core_opt = tokio::task::spawn_blocking(move || { + full_for_parse + .get_core_version(&resolved_for_parse) + .map(Arc::new) + }) + .await + .ok() + .flatten(); + + let Some(core_arc) = core_opt else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Self-contained per-spec future. Either fetches `(name)`'s full +/// manifest from the registry (if not yet cached), or settles against +/// an already-cached one. In both cases it: +/// * writes `full_manifests` and `version_manifests` cache slots +/// for the resolved spec, +/// * returns the spec's transitive deps for the main loop to +/// enqueue. +/// +/// Racing-fetch handling: two specs for the same name dispatched +/// concurrently both enter the fetch branch (no in-flight gate). The +/// second one re-issues a network round-trip; the cost is bounded by +/// the small number of sibling specs in real workloads (<2% in +/// ant-design-x). Last writer to `cache.set_full_manifest` wins; +/// content is identical so correctness is preserved. +async fn fetch_or_settle( + name: String, + spec: String, + registry_url: String, + cache: MemoryCache, + peer_deps: PeerDeps, +) -> Vec { + // Sibling fast path: full manifest already cached. 
+ if let Some(full) = cache.get_full_manifest(&name) { + return settle_sibling(name, spec, full, cache, peer_deps).await; + } + + let result = fetch_full_manifest_with_settle( + FetchManifestOptions { + registry_url: ®istry_url, + name: &name, + format: MetadataFormat::Abbreviated, + etag: None, + }, + &spec, + ) + .await; + + let Ok(FetchWithSettleResult::Ok(payload)) = result else { + return Vec::new(); + }; + + let full_arc = Arc::new(payload.manifest); + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + + let Some((resolved, core_arc)) = payload.primary_settle else { + return Vec::new(); + }; + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); + extract_transitive(&core_arc, peer_deps) +} + +/// Manifest-bench-style flat parallel fetch. See module docs for the +/// rationale. +pub async fn mb_fetch( + initial_deps: Vec, + registry_url: &str, + cache: &MemoryCache, + config: &PreloadConfig, +) -> MbFetchStats { + let mut stats = MbFetchStats::default(); + let mut pending: VecDeque = initial_deps.into(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut futs = FuturesUnordered::new(); + let cap = config.concurrency; + let peer_deps = config.peer_deps; + let registry_url = registry_url.to_string(); + + let start = tokio::time::Instant::now(); + + // Initial fill — same shape as the refill below. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + + while let Some(transitive) = futs.next().await { + if transitive.is_empty() { + // Empty result is ambiguous (no transitive deps OR fetch + // failed) — `MbFetchStats` only tracks success/fail at a + // coarse level. 
The fetch-timings counters (recorded + // inside `fetch_full_manifest_with_settle`) carry the + // detailed per-fetch metrics. + stats.fail += 1; + } else { + stats.success += 1; + } + pending.extend(transitive); + + // Refill — same body as the initial fill above. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if !seen.insert((name.clone(), spec.clone())) { + continue; + } + futs.push(Box::pin(fetch_or_settle( + name, + spec, + registry_url.clone(), + cache.clone(), + peer_deps, + ))); + } + } + + let wall = start.elapsed(); + tracing::info!( + "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", + wall.as_millis(), + stats.success, + stats.fail, + FETCH_TIMINGS.snapshot().summary_line(), + ); + + stats +} diff --git a/crates/ruborist/src/resolver/mod.rs b/crates/ruborist/src/resolver/mod.rs index e7baad988..2d0a288d9 100644 --- a/crates/ruborist/src/resolver/mod.rs +++ b/crates/ruborist/src/resolver/mod.rs @@ -8,6 +8,7 @@ pub mod fast_preload; pub mod git; #[cfg(feature = "http-tarball")] pub mod http; +pub mod mb_resolve; pub mod preload; pub mod registry; pub mod runtime; diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 3b9b713ea..9687fc875 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -39,6 +39,7 @@ use crate::resolver::builder::{ gather_preload_deps, }; use crate::resolver::fast_preload::fast_preload; +use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; @@ -332,6 +333,166 @@ where }) } +/// Experimental parallel-track entry point: structurally identical to +/// [`build_deps`] but routes the manifest-fetch phase through +/// [`crate::resolver::mb_resolve::mb_fetch`] instead of +/// [`crate::resolver::fast_preload::fast_preload`]. 
+/// +/// Intended for A/B benchmarking: install + lockfile-only callers can +/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). +/// All other behavior — workspace discovery, runtime injection, BFS, +/// graph→lock serialization, project cache export — is the same as +/// `build_deps`. The `EventReceiver` still receives BFS events; it +/// does NOT receive `PreloadFetching` / `PreloadProgress` events +/// because mb_fetch is silent (matches `manifest-bench`'s zero-event +/// loop). +/// +/// **Install-path note:** `pipeline_deps_options` callers that need +/// `PackageResolved` events to drive the download/clone pipeline +/// won't pipeline under this path — mb_fetch finishes all fetches +/// before BFS starts. Use only for `utoo deps`-style workloads, or +/// accept that install becomes phase-sequential. +pub async fn build_deps_mb(options: BuildDepsOptions) -> Result +where + G: Glob + Clone, + R: EventReceiver, +{ + let BuildDepsOptions { + cwd, + registry_url, + cache_dir, + manifest_store, + warm_project_cache, + concurrency, + peer_deps, + glob, + receiver, + supports_semver, + catalogs, + skip_preload: _, + } = options; + + // Steps 1-6: structurally identical to `build_deps` — read + // package.json, inject runtime deps, build initial graph, add + // root edges, discover and add workspaces. 
+ let discovery = WorkspaceDiscovery::new(glob.clone()); + let root_path = discovery.find_root_path(&cwd).await?; + let pkg_path = root_path.join("package.json"); + let mut pkg: PackageJson = super::fs::read_json(&pkg_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; + + if let Some(engines) = &pkg.engines { + let runtime_deps = install_runtime_from_map(engines); + if !runtime_deps.is_empty() { + for (name, version) in runtime_deps { + pkg.optional_dependencies + .get_or_insert_with(HashMap::new) + .entry(name) + .or_insert(version); + } + } + } + + let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); + let root_index = graph.root_index; + let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); + add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); + + let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; + for workspace in workspaces { + let ws_pkg = workspace.package_json; + let workspace_node = + PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let workspace_index = graph.add_node(workspace_node); + let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); + let link_index = graph.add_node(link_node); + graph.add_physical_edge(root_index, workspace_index); + graph.add_physical_edge(root_index, link_index); + let dep_edge_id = graph.add_dependency_edge( + root_index, + workspace.name.clone(), + &ws_pkg.version, + EdgeType::Prod, + ); + graph.mark_dependency_resolved(dep_edge_id, workspace_index); + add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); + } + + // Step 7-8: cache + registry, same as `build_deps`. Warm project + // cache is honored. 
+ let package_cache = Arc::new(PackageCache::default()); + let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); + + let mut builder = UnifiedRegistry::builder() + .registry(®istry_url) + .cache(package_cache) + .store(Arc::clone(&manifest_store)); + if let Some(semver) = supports_semver { + builder = builder.supports_semver(semver); + } + let registry = builder.build(); + + // Run mb_fetch instead of fast_preload — pre-warms cache by + // walking transitive deps via flat FuturesUnordered. Skipped if + // the warm project cache already covers the workload. + if cache_count == 0 { + let initial_deps = gather_preload_deps(&graph, peer_deps); + let preload_config = PreloadConfig { + peer_deps, + concurrency, + }; + mb_fetch( + initial_deps, + registry.registry_url(), + registry.cache(), + &preload_config, + ) + .await; + } + + // BFS phase reads the now-warm cache. `skip_preload=true` skips + // the receiver-driven preload — mb_fetch already ran. + let mut config = BuildDepsConfig::default() + .with_peer_deps(peer_deps) + .with_concurrency(concurrency) + .with_skip_preload(true) + .with_catalogs(catalogs); + if let Some(dir) = cache_dir { + config = config.with_cache_dir(dir); + } + + build_deps_with_config(&mut graph, ®istry, config, &receiver) + .await + .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; + + let t_serialize_start = std::time::Instant::now(); + let (packages, _total) = graph.serialize_to_packages(&root_path); + let serialize_us = t_serialize_start.elapsed().as_micros() as u64; + + let t_cache_export_start = std::time::Instant::now(); + let mut project_cache = ProjectCacheData::default(); + for (key, manifest) in registry.cache().export_version_manifests() { + let (name, spec) = parse_package_spec(&key); + let version = manifest.version.clone(); + let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); + pkg_cache.specs.insert(spec.to_string(), version.clone()); + 
pkg_cache.manifests.insert(version, (*manifest).clone()); + } + let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; + + tracing::info!( + "p1-breakdown serialize_us={} cache_export_us={}", + serialize_us, + cache_export_us, + ); + + Ok(BuildDepsOutput { + lock: PackageLock::new(&pkg.name, &pkg.version, packages), + project_cache, + }) +} + /// Pre-populate `cache` from a warm project cache. Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 5adb6bf0b..7a7cf8ca8 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From 02cc12e7a23214672215a1ee1efd6317e7ce6d8c Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 10:39:27 +0800 Subject: [PATCH 20/32] =?UTF-8?q?perf(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20two-phase=20pure=20HTTP=20+=20rayon=20batch=20parse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1/v2 ran parse work in spawn_blocking inside each fetch future, which competed with tokio runtime workers for the 2 GHA cores. CI showed eff_parallel capped at 47/96 vs manifest-bench standalone's 75/96 on the same box. Hypothesis: parse CPU starves socket drive. v3 separates the two phases: * PHASE 1 — `mb_style_pure_fetch` is a structural copy of `manifest-bench`'s main loop: future body does ONLY GET + body recv, refill 1-for-1 on completion. Zero per-future CPU work, so tokio runtime workers retain full CPU for socket drive. 
* PHASE 2 — bulk rayon par_iter parse: for each body, parse `FullManifest` envelope via simd_json::to_borrowed_value, resolve every queued spec for this name against the just-parsed manifest, populate cache slots, collect transitive deps. Runs off the tokio runtime entirely (spawn_blocking → rayon par_iter). Phases alternate until pending exhausted. Typical project: 3-5 iterations as the dep tree fans out wave by wave. The point of the split is the `phase1_http_wall` trace — measured in isolation from any parse work, it should match manifest-bench's standalone wall (~1.5-2.0s for 2733 names @ conc=96). If it does, the remaining gap to mb is concentrated in phase 2 work, which is inherent to discovering transitive deps from a non-flat name list. Tracing per iteration: p1-breakdown mb_fetch iter=N phase1_http_wall=Xms n=Y bytes=Z p1-breakdown mb_fetch iter=N phase2_parse_wall=Xms settles=Y new_transitives=Z p1-breakdown mb_fetch total_wall=Xms iters=Y Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 494 ++++++++++++++------- 1 file changed, 332 insertions(+), 162 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 2928638be..05e1bf038 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,61 +1,87 @@ -//! Manifest-bench-style flat manifest fetcher (experimental new pipeline). +//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors +//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + +//! settle. //! -//! A parallel-track alternative to [`super::fast_preload`], structured -//! to match `manifest-bench`'s main-loop shape as closely as -//! correctness allows. The hypothesis under test: `fast_preload`'s -//! eff_parallel caps at ~50 against a 96-cap because the main loop's -//! CPU work (FastEvent enum match + cache writes + sibling-deferred -//! 
bookkeeping + Box::pin allocation) competes with tokio runtime -//! workers for the 2 cores on GHA, stalling socket I/O drive. +//! ## Phase split //! -//! `mb_resolve` pushes ALL per-fetch work into the spawned future -//! itself (cache writes included) so the main loop is reduced to: +//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` +//! ran in `spawn_blocking` threads that competed with tokio runtime +//! workers for CPU on the 2-core GHA box. When 50+ parses ran in +//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` +//! capped at ~47 against the 96 cap (vs `manifest-bench` standalone's +//! 75 on the same box). //! -//! ```ignore -//! while let Some(deps) = futs.next().await { -//! pending.extend(deps); -//! refill_to_cap(&mut futs, &mut pending, ...); -//! } -//! ``` +//! v3 separates the work: //! -//! Sibling specs (multiple ranges on the same package) are NOT -//! deferred at queue level — if two specs for the same name race, -//! both fetch. This wastes a small number of network requests (~5-50 -//! across a real install) but keeps the main loop's per-event cost -//! minimal (no HashMap probe / drain). The race converges: whichever -//! fetch lands first populates `full_manifests`; subsequent racers -//! find the cache hit on entry and short-circuit to a sibling-style -//! settle without re-fetching. +//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of +//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, +//! nothing else) + 1-for-1 refill on completion. The future body +//! has zero CPU work, so the tokio runtime workers retain full CPU +//! to drive sockets and `eff_parallel` reaches the same level as +//! the standalone bench. //! -//! Wiring: opt-in via `UTOO_RESOLVE=mb` env var. Both `utoo deps` -//! and `utoo install` route through this when set; install loses -//! pipelining (mb_fetch doesn't emit `PackageResolved` events) but -//! 
gains the lean main loop for resolve-phase A/B testing. +//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For +//! each fetched body: parse `FullManifest` envelope, resolve every +//! spec we need for this name, materialize `CoreVersionManifest` +//! subtrees, populate cache slots, collect transitive deps for the +//! next iteration. +//! +//! Phases alternate until `pending` is empty (typical project: 3-5 +//! iterations as transitive deps fan out wave by wave). +//! +//! Phase 1 is the line we measure against `manifest-bench` — +//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us +//! check eff_parallel directly. +//! +//! Wired in via `UTOO_RESOLVE=mb` env var (see +//! `pm::helper::ruborist_context::Context::build_deps`). -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use bytes::Bytes; use futures::stream::{FuturesUnordered, StreamExt}; +use rayon::prelude::*; +use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; -use crate::service::{ - FetchManifestOptions, FetchWithSettleResult, MemoryCache, MetadataFormat, - fetch_full_manifest_with_settle, -}; +use crate::service::MemoryCache; +use crate::service::http::get_client; use crate::spec::SpecStr; -use crate::util::FETCH_TIMINGS; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, + pub iterations: usize, +} + +/// Phase 1 result: one body per fetched name. `bytes` is `None` on +/// transport / non-2xx — kept in the result vector so phase 2 can +/// account for it, but contributes no settle work. +struct FetchOutcome { + name: String, + bytes: Option, } -/// Collect dependencies from a deps map, filtering non-registry specs. 
-fn collect_deps(map: Option<&std::collections::HashMap>) -> Vec { +/// Phase 2 per-name output. `full` is `None` on parse failure. +struct ParseOutcome { + name: String, + full: Option>, + /// Per-spec settled subtrees: `(spec, resolved_version, core)`. + /// Empty when the body failed to fetch / parse, or when no spec + /// resolves against the manifest. + settled: Vec<(String, String, Arc)>, + /// Transitive deps collected across all settled subtrees for this + /// name. Already filtered to registry specs; the main loop dedups + /// against `done_names` before queueing. + transitives: Vec, +} + +fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() .filter(|(_, spec)| spec.is_registry_spec()) @@ -73,99 +99,177 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Settle one (name, spec) against an already-cached `FullManifest`. -/// Used for sibling specs (or racing-fetch losers) — extracts the -/// resolved version's `CoreVersionManifest` on the blocking pool, -/// populates both `(name, spec)` and `(name, resolved_version)` cache -/// slots so BFS hits the early-return fast path. -async fn settle_sibling( - name: String, - spec: String, - full: Arc, - cache: MemoryCache, - peer_deps: PeerDeps, -) -> Vec { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - return Vec::new(); +/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future +/// body does ONLY GET + body recv; no parse, no cache writes, no +/// dedup. Returns one `FetchOutcome` per input name in arrival order. 
+async fn mb_style_pure_fetch(
+    names: Vec<String>,
+    registry_url: &str,
+    concurrency: usize,
+) -> Vec<FetchOutcome> {
+    let client = match get_client() {
+        Ok(c) => c.clone(),
+        Err(e) => {
+            tracing::warn!("get_client failed: {e}");
+            return Vec::new();
+        }
     };
-    if let Some(cached) = cache.get_version_manifest(&name, &resolved) {
-        cache.set_version_manifest(name, spec, Arc::clone(&cached));
-        return extract_transitive(&cached, peer_deps);
-    }
-    let resolved_for_parse = resolved.clone();
-    let full_for_parse = Arc::clone(&full);
-    let core_opt = tokio::task::spawn_blocking(move || {
-        full_for_parse
-            .get_core_version(&resolved_for_parse)
-            .map(Arc::new)
-    })
-    .await
-    .ok()
-    .flatten();
+    let mut results: Vec<FetchOutcome> = Vec::with_capacity(names.len());
+    let mut futs = FuturesUnordered::new();
+    let mut idx = 0usize;
-    let Some(core_arc) = core_opt else {
-        return Vec::new();
+    let spawn_one = |client: &reqwest::Client,
+                     registry_url: &str,
+                     name: String,
+                     futs: &mut FuturesUnordered<_>| {
+        let url = format!("{}/{}", registry_url, name);
+        let client = client.clone();
+        futs.push(Box::pin(async move {
+            let bytes = match client
+                .get(&url)
+                .header("accept", "application/vnd.npm.install-v1+json")
+                .send()
+                .await
+            {
+                Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(),
+                _ => None,
+            };
+            FetchOutcome { name, bytes }
+        }));
     };
-    cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc));
-    cache.set_version_manifest(name, resolved, Arc::clone(&core_arc));
-    extract_transitive(&core_arc, peer_deps)
+
+    while idx < names.len() && futs.len() < concurrency {
+        spawn_one(&client, registry_url, names[idx].clone(), &mut futs);
+        idx += 1;
+    }
+
+    while let Some(outcome) = futs.next().await {
+        results.push(outcome);
+        if idx < names.len() {
+            spawn_one(&client, registry_url, names[idx].clone(), &mut futs);
+            idx += 1;
+        }
+    }
+
+    results
 }
Either fetches `(name)`'s full -/// manifest from the registry (if not yet cached), or settles against -/// an already-cached one. In both cases it: -/// * writes `full_manifests` and `version_manifests` cache slots -/// for the resolved spec, -/// * returns the spec's transitive deps for the main loop to -/// enqueue. -/// -/// Racing-fetch handling: two specs for the same name dispatched -/// concurrently both enter the fetch branch (no in-flight gate). The -/// second one re-issues a network round-trip; the cost is bounded by -/// the small number of sibling specs in real workloads (<2% in -/// ant-design-x). Last writer to `cache.set_full_manifest` wins; -/// content is identical so correctness is preserved. -async fn fetch_or_settle( +/// Sync phase 2 worker: parse one body, settle all specs we need for +/// this name. Runs on rayon (called from `par_iter` in +/// `parse_settle_batch`). +fn parse_one_body( name: String, - spec: String, - registry_url: String, - cache: MemoryCache, + raw: Bytes, + specs: Vec, peer_deps: PeerDeps, -) -> Vec { - // Sibling fast path: full manifest already cached. 
- if let Some(full) = cache.get_full_manifest(&name) { - return settle_sibling(name, spec, full, cache, peer_deps).await; - } +) -> ParseOutcome { + use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - let result = fetch_full_manifest_with_settle( - FetchManifestOptions { - registry_url: ®istry_url, - name: &name, - format: MetadataFormat::Abbreviated, - etag: None, - }, - &spec, - ) - .await; - - let Ok(FetchWithSettleResult::Ok(payload)) = result else { - return Vec::new(); + let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); + let mut buf = raw.to_vec(); + let parsed = match simd_json::to_borrowed_value(&mut buf) { + Ok(v) => v, + Err(_) => { + return ParseOutcome { + name, + full: None, + settled: Vec::new(), + transitives: Vec::new(), + }; + } }; - let full_arc = Arc::new(payload.manifest); - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + let envelope_name = parsed + .get("name") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| name.clone()); + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_keys: Vec = parsed + .get("versions") + .and_then(simd_json::prelude::ValueAsObject::as_object) + .map(|obj| obj.keys().map(|k| k.to_string()).collect()) + .unwrap_or_default(); - let Some((resolved, core_arc)) = payload.primary_settle else { - return Vec::new(); + let full = FullManifest { + name: envelope_name, + dist_tags, + versions: versions_keys, + raw: Arc::clone(&raw_arc), + ..Default::default() }; - cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name, resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) + let full_arc = Arc::new(full); + + // For each requested spec, resolve + extract version subtree. + // Cache the per-(name, version) `CoreVersionManifest` so sibling + // specs that resolve to the same version reuse the same Arc. 
+ let mut version_cache: HashMap> = HashMap::new(); + let mut settled = Vec::with_capacity(specs.len()); + let mut transitives = Vec::new(); + + for spec in specs { + let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { + continue; + }; + let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { + Arc::clone(cached) + } else { + let Some(core) = parsed + .get("versions") + .and_then(|v| v.get(resolved_version.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) + else { + continue; + }; + let arc = Arc::new(core); + version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); + arc + }; + transitives.extend(extract_transitive(&core_arc, peer_deps)); + settled.push((spec, resolved_version, core_arc)); + } + + ParseOutcome { + name, + full: Some(full_arc), + settled, + transitives, + } +} + +/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. +async fn parse_settle_batch( + bodies: Vec, + by_name: HashMap>, + peer_deps: PeerDeps, +) -> Vec { + let work: Vec<(String, Bytes, Vec)> = bodies + .into_iter() + .filter_map(|f| { + let bytes = f.bytes?; + let specs = by_name.get(&f.name).cloned().unwrap_or_default(); + Some((f.name, bytes, specs)) + }) + .collect(); + + if work.is_empty() { + return Vec::new(); + } + + tokio::task::spawn_blocking(move || { + work.into_par_iter() + .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) + .collect::>() + }) + .await + .unwrap_or_default() } -/// Manifest-bench-style flat parallel fetch. See module docs for the -/// rationale. +/// Two-phase mb-style fetch with rayon batch parse. See module docs. 
pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -173,70 +277,136 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending: VecDeque = initial_deps.into(); - let mut seen: HashSet<(String, String)> = HashSet::new(); - let mut futs = FuturesUnordered::new(); - let cap = config.concurrency; + let mut pending_specs: Vec = initial_deps; + let mut done_names: HashSet = HashSet::new(); + let conc = config.concurrency; let peer_deps = config.peer_deps; - let registry_url = registry_url.to_string(); + let total_start = tokio::time::Instant::now(); - let start = tokio::time::Instant::now(); + while !pending_specs.is_empty() { + stats.iterations += 1; + let iter = stats.iterations; - // Initial fill — same shape as the refill below. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; - }; - if !seen.insert((name.clone(), spec.clone())) { - continue; + // Group this iteration's pending specs by name. + let mut by_name: HashMap> = HashMap::new(); + for (name, spec) in pending_specs.drain(..) { + by_name.entry(name).or_default().push(spec); } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); - } - while let Some(transitive) = futs.next().await { - if transitive.is_empty() { - // Empty result is ambiguous (no transitive deps OR fetch - // failed) — `MbFetchStats` only tracks success/fail at a - // coarse level. The fetch-timings counters (recorded - // inside `fetch_full_manifest_with_settle`) carry the - // detailed per-fetch metrics. - stats.fail += 1; - } else { - stats.success += 1; + // Names whose full manifest is already cached from a prior + // iteration: settle their siblings synchronously (cheap + // semver match + cache lookup; no parse if version_manifest + // already cached, otherwise quick simd_json subtree extract). 
+ let mut sibling_only: Vec<(String, Vec)> = Vec::new(); + let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); + for (name, specs) in &by_name { + if done_names.contains(name) { + sibling_only.push((name.clone(), specs.clone())); + } else { + to_fetch.push(name.clone()); + } } - pending.extend(transitive); - // Refill — same body as the initial fill above. - while futs.len() < cap { - let Some((name, spec)) = pending.pop_front() else { - break; + // Sibling settles (rare on real workloads — most names appear + // exactly once across the whole walk). + for (name, specs) in sibling_only { + let Some(full) = cache.get_full_manifest(&name) else { + continue; }; - if !seen.insert((name.clone(), spec.clone())) { + for spec in specs { + let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { + continue; + }; + if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); + pending_specs.extend(extract_transitive(&cached, peer_deps)); + continue; + } + if let Some(core) = full.get_core_version(&resolved) { + let core_arc = Arc::new(core); + cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); + pending_specs.extend(extract_transitive(&core_arc, peer_deps)); + } + } + } + + if to_fetch.is_empty() { + // Iteration drained pending entirely via sibling settles. + continue; + } + + // PHASE 1 — pure HTTP, mb-style. 
+ let p1_start = tokio::time::Instant::now(); + let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; + let p1_wall = p1_start.elapsed().as_millis(); + let total_bytes: usize = bodies + .iter() + .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) + .sum(); + tracing::info!( + "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", + iter, + p1_wall, + to_fetch.len(), + total_bytes, + ); + + // PHASE 2 — rayon batch parse + settle. + let p2_start = tokio::time::Instant::now(); + let by_name_for_parse = by_name + .iter() + .filter(|(name, _)| !done_names.contains(*name)) + .map(|(n, s)| (n.clone(), s.clone())) + .collect::>(); + let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; + let p2_wall = p2_start.elapsed().as_millis(); + + let mut new_transitives: Vec = Vec::new(); + let mut settle_count = 0usize; + let mut fail_count = 0usize; + for outcome in parsed { + done_names.insert(outcome.name.clone()); + let Some(full_arc) = outcome.full else { + fail_count += 1; continue; + }; + cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); + for (spec, resolved, core) in outcome.settled { + cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); + cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); + settle_count += 1; } - futs.push(Box::pin(fetch_or_settle( - name, - spec, - registry_url.clone(), - cache.clone(), - peer_deps, - ))); + new_transitives.extend(outcome.transitives); } + // Names that fetched but failed parse — still mark done so we + // don't refetch them next iteration. 
+ for name in to_fetch { + done_names.insert(name); + } + + stats.success += settle_count; + stats.fail += fail_count; + + tracing::info!( + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + iter, + p2_wall, + settle_count, + fail_count, + new_transitives.len(), + ); + + pending_specs.extend(new_transitives); } - let wall = start.elapsed(); + let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={} | {}", - wall.as_millis(), + "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + total_wall, + stats.iterations, stats.success, stats.fail, - FETCH_TIMINGS.snapshot().summary_line(), ); stats From 24165fb6d355d78cc606b69773fe2dc466560834 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:09:29 +0800 Subject: [PATCH 21/32] =?UTF-8?q?fix(pm):=20mb=5Fresolve=20v3=20=E2=80=94?= =?UTF-8?q?=20restore=20spec-level=20dedup=20to=20terminate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v3 dropped the (name, spec) HashSet from v1/v2 thinking name-level dedup via done_names was sufficient. It wasn't: sibling-settle's extract_transitive can re-introduce specs we've already settled (peer/optional dep cycles trivially trigger this), so the outer while-loop never terminated. CI 25589397823 hung on `Run phase-isolated benchmark · npmjs` for ~25 min before being cancelled — the bench's first utoo p1_resolve hyperfine run got stuck in an infinite settle loop. Fix: maintain `seen_specs: HashSet<(String, String)>` across all iterations; filter both initial seed and every wave of new transitives through it before extending pending_specs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 42 ++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 05e1bf038..7ef0b5d85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -278,11 +278,20 @@ pub async fn mb_fetch( ) -> MbFetchStats { let mut stats = MbFetchStats::default(); let mut pending_specs: Vec = initial_deps; + // (name, spec) pairs we've already processed (settled or queued + // to settle). Without this, sibling-settle's transitive deps can + // re-introduce already-walked specs and the outer loop never + // terminates — peer / optional dep cycles trivially trigger this. + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut done_names: HashSet = HashSet::new(); let conc = config.concurrency; let peer_deps = config.peer_deps; let total_start = tokio::time::Instant::now(); + // Filter the initial seed through `seen_specs` too — root + workspace + // edges can list the same dep multiple times across workspaces. + pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + while !pending_specs.is_empty() { stats.iterations += 1; let iter = stats.iterations; @@ -308,7 +317,8 @@ pub async fn mb_fetch( } // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). + // exactly once across the whole walk). New transitives go + // through `seen_specs` dedup before joining `pending_specs`. 
for (name, specs) in sibling_only { let Some(full) = cache.get_full_manifest(&name) else { continue; @@ -317,17 +327,22 @@ pub async fn mb_fetch( let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { continue; }; - if let Some(cached) = cache.get_version_manifest(&name, &resolved) { + let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - pending_specs.extend(extract_transitive(&cached, peer_deps)); - continue; - } - if let Some(core) = full.get_core_version(&resolved) { + extract_transitive(&cached, peer_deps) + } else if let Some(core) = full.get_core_version(&resolved) { let core_arc = Arc::new(core); cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - pending_specs.extend(extract_transitive(&core_arc, peer_deps)); - } + extract_transitive(&core_arc, peer_deps) + } else { + Vec::new() + }; + pending_specs.extend( + new_deps + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), + ); } } @@ -388,16 +403,21 @@ pub async fn mb_fetch( stats.success += settle_count; stats.fail += fail_count; + let new_unique: Vec = new_transitives + .into_iter() + .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) + .collect(); + tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_transitives={}", + "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", iter, p2_wall, settle_count, fail_count, - new_transitives.len(), + new_unique.len(), ); - pending_specs.extend(new_transitives); + pending_specs.extend(new_unique); } let total_wall = total_start.elapsed().as_millis(); From 41822b081c713758fdbd633513d7257258f39d45 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 11:51:06 +0800 Subject: [PATCH 22/32] 
=?UTF-8?q?perf(pm):=20preload-bench=20=E2=80=94=20s?= =?UTF-8?q?elf-contained=20streaming=20preload=20baseline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New crate `crates/preload-bench/` is a fully-standalone bench that: * Uses the SAME HTTP setup as `manifest-bench` (own reqwest::Client built per rep with aws-lc-rs TLS, pool_max_idle_per_host(256), no proxy, default DNS, no retry, h1_only). * Discovers names by walking transitive deps from a package.json root — instead of consuming a flat name list like manifest-bench. * Per-future does GET + body recv + spawn_blocking parse → returns transitive deps → main loop refills on completion. * No dependency on ruborist or any utoo internals (own simd_json, own dedup, own everything). The point: prove (or disprove) that a fully ruborist-independent streaming preload can hit standalone manifest-bench's wall on the same workload. ruborist's path runs at ~2.18s for ant-design's ~2700 names; manifest-bench standalone runs the same workload at ~1.6s. The gap could be in any number of things — DNS layer, retry, pool config, parse-CPU contention, registry single-flight gates. preload-bench eliminates all of those simultaneously so we can read the wall directly. Wired into bench-phases-linux: builds + uploads preload-bench binary alongside manifest-bench, then runs a conc=64/96/128 sweep against the same project after the standalone manifest-bench sweep. bench script reverts UTOO_RESOLVE=mb so utoo runs default fast_preload — gives a third datapoint (utoo wall on integrated path) alongside manifest-bench (HTTP-only ceiling) and preload-bench (streaming-with-walk ceiling). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 69 ++++ Cargo.toml | 1 + bench/pm-bench-phases.sh | 12 +- crates/preload-bench/Cargo.toml | 38 +++ crates/preload-bench/src/main.rs | 505 +++++++++++++++++++++++++++++ 5 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 crates/preload-bench/Cargo.toml create mode 100644 crates/preload-bench/src/main.rs diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index b25f5c380..eb560969b 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -161,6 +161,25 @@ jobs: name: manifest-bench-linux-x64 path: target/x86_64-unknown-linux-gnu/release/manifest-bench retention-days: 1 + # preload-bench: same HTTP setup as manifest-bench, but discovers + # names by walking transitive deps from a package.json root — + # tests whether a fully self-contained streaming preload can match + # standalone manifest-bench's wall on the same workload that + # ruborist's path runs at ~2.18s. 
+ - name: Build preload-bench + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + run: cargo build --release --target x86_64-unknown-linux-gnu -p preload-bench + - name: Upload preload-bench binary + if: > + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) || + (github.event_name == 'workflow_dispatch' && (inputs.target == 'pm-bench-phases' || inputs.target == 'pm-bench-pcap')) + uses: actions/upload-artifact@v4 + with: + name: preload-bench-linux-x64 + path: target/x86_64-unknown-linux-gnu/release/preload-bench + retention-days: 1 # Piggyback on the already-built target/ from the step above: when the # PR is labeled `benchmark`, overlay origin/next's tree onto the current # workdir and re-run cargo build. cargo's incremental compile only @@ -547,6 +566,20 @@ jobs: chmod +x /tmp/manifest-bench-dist/manifest-bench mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + # Self-contained streaming preload bench — same HTTP setup as + # manifest-bench but discovers names via transitive walk from a + # package.json. Used to test whether a fully-isolated path can + # match standalone manifest-bench's wall on the same workload. 
+ - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Verify tools run: | hyperfine --version @@ -645,6 +678,42 @@ jobs: "$MANIFEST_BENCH_BIN" --lockfile package-lock.json --registry "$REGISTRY" \ --concurrency 128 --reps 2 --http1-only --user-agent "Bun/1.2.21" || true } 2>&1 | tee "$MB_LOG" + # Self-contained streaming preload (transitive walk from + # package.json) — same HTTP setup as manifest-bench but with a + # streaming FuturesUnordered + per-future parse. This tests + # whether a fully ruborist-independent path can hit standalone + # manifest-bench's wall under the same project workload. + - name: Standalone preload-bench (transitive walk sweep) + env: + PROJECT: ${{ github.event.inputs.project || 'ant-design' }} + REGISTRY: 'https://registry.npmjs.org' + run: | + set -eu + mkdir -p /tmp/pm-bench-output + PROJECT_DIR="/tmp/pm-bench/$PROJECT" + if [ ! -d "$PROJECT_DIR" ]; then + echo "no project dir; skipping preload-bench"; exit 0 + fi + PJ="$PROJECT_DIR/package.json" + if [ ! -f "$PJ" ]; then + echo "no package.json; skipping preload-bench"; exit 0 + fi + + PB_LOG=/tmp/pm-bench-output/preload-bench-npmjs.log + { + echo "============================================================" + echo "preload-bench: streaming transitive-walk preload" + echo " Self-contained (no ruborist deps). Same HTTP setup as" + echo " manifest-bench, but discovers names by walking transitive" + echo " deps from package.json instead of consuming a flat list." 
+ echo "============================================================" + for CAP in 64 96 128; do + echo + echo "--- concurrency=$CAP, h1, transitive walk ---" + "$PRELOAD_BENCH_BIN" --package-json "$PJ" --registry "$REGISTRY" \ + --concurrency "$CAP" --reps 4 || true + done + } 2>&1 | tee "$PB_LOG" - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 diff --git a/Cargo.toml b/Cargo.toml index 0574a185a..4b2836c06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "crates/manifest-bench", + "crates/preload-bench", "crates/pack-api", "crates/pack-cli", "crates/pack-core", diff --git a/bench/pm-bench-phases.sh b/bench/pm-bench-phases.sh index 26e43388c..b025ebc6f 100755 --- a/bench/pm-bench-phases.sh +++ b/bench/pm-bench-phases.sh @@ -22,12 +22,12 @@ UTOO_NEXT_CACHE="${UTOO_NEXT_CACHE:-/tmp/utoo-next-bench-cache}" BUN_CACHE="${BUN_CACHE:-/tmp/bun-bench-cache}" export BUN_INSTALL_CACHE_DIR="$BUN_CACHE" -# Route the current `utoo` binary's resolve phase through the -# experimental `mb_resolve` flat-fetch path. Other PMs ignore this -# env var (utoo-next is built from origin/next which doesn't have -# the flag; utoo-npm/bun ignore unrecognized env vars). Comment out -# to A/B against the default `fast_preload` path. -export UTOO_RESOLVE=mb +# utoo path defaults to fast_preload (combined-parse) so we have a +# stable baseline to compare against. preload-bench is run as a +# separate standalone tool by the CI workflow — its wall is the +# self-contained-streaming reference, ruborist's utoo p1_resolve +# wall is the integrated path. The gap between them is what +# remains to close. 
# Drop optional baselines from the PM list when their binary is not wired # up — UTOO_NPM_BIN is set by CI's "Install utoo@npm" step, UTOO_NEXT_BIN diff --git a/crates/preload-bench/Cargo.toml b/crates/preload-bench/Cargo.toml new file mode 100644 index 000000000..9d37d7769 --- /dev/null +++ b/crates/preload-bench/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "preload-bench" +version = "0.0.0" +edition = "2024" +license = "MIT" +publish = false +description = "Self-contained streaming-with-transitive-walk manifest preload bench. Reproduces manifest-bench's standalone fetch loop but discovers transitive deps from package.json instead of consuming a flat name list. No dependency on ruborist or any utoo internals." + +[[bin]] +name = "preload-bench" +path = "src/main.rs" + +# tombi: format.rules.table-keys-order.disabled = true +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +futures = "0.3" +serde = { version = "1", features = ["derive"] } +serde_json = { workspace = true } +simd-json = "0.17" +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs", "time"] } + +# Same TLS/DNS choices as manifest-bench so the only delta vs that bench +# is the transitive-walk loop. +reqwest = { version = "0.12", default-features = false, features = [ + "brotli", + "gzip", + "http2", + "rustls-tls-native-roots-no-provider", + "socks" +] } +rustls = { version = "0.23", default-features = false, features = [ + "aws-lc-rs", + "logging", + "std", + "tls12" +] } +rustls-native-certs = "0.8" diff --git a/crates/preload-bench/src/main.rs b/crates/preload-bench/src/main.rs new file mode 100644 index 000000000..46f917d19 --- /dev/null +++ b/crates/preload-bench/src/main.rs @@ -0,0 +1,505 @@ +//! Self-contained streaming preload bench with transitive walking. +//! +//! Same HTTP setup as `manifest-bench` (own `reqwest::Client` built +//! per rep with `aws-lc-rs` TLS, `pool_max_idle_per_host(256)`, no +//! proxy, default DNS, no retry). 
The only delta vs `manifest-bench` +//! is that this bench discovers names by walking transitive deps +//! from a `package.json` root, instead of consuming a flat name +//! list. +//! +//! Why a separate crate: ruborist's manifest-fetch path goes through +//! several service layers (custom DNS resolver, retry, cache, +//! single-flight gates, event receivers). Each layer might add +//! overhead. This bench bypasses all of them — same shape as +//! manifest-bench, just with a streaming `FuturesUnordered` that +//! refills from a pending queue extended by parsed transitive deps. +//! +//! Reports both the standalone preload wall and a per-rep eff_parallel +//! number so we can compare directly against manifest-bench's +//! `phase_wall` + `avg_conc` for the same workload. +//! +//! Output (one line per rep, matching manifest-bench shape): +//! [rep N] preload_wall=Xms n=Y bytes=Z avg_conc=N.N parse_sum=Wms 200=A 4xx=B err=C + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use futures::stream::{FuturesUnordered, StreamExt}; +use serde::Deserialize; + +#[derive(Parser, Debug)] +#[command( + name = "preload-bench", + about = "Streaming preload bench with transitive walking (self-contained)" +)] +struct Args { + /// Registry base URL. + #[arg(long, default_value = "https://registry.npmjs.org")] + registry: String, + + /// Path to a `package.json` to walk from. Reads `dependencies` + + /// `devDependencies` + `optionalDependencies` as the initial seed. + #[arg(long)] + package_json: PathBuf, + + /// Maximum concurrent in-flight requests. + #[arg(long, default_value_t = 96)] + concurrency: usize, + + /// Number of times to repeat the whole walk (fresh client per rep). + #[arg(long, default_value_t = 4)] + reps: usize, + + /// Force HTTP/1.1. + #[arg(long, default_value_t = true)] + http1_only: bool, + + /// Override `User-Agent`. 
+    #[arg(long)]
+    user_agent: Option<String>,
+
+    /// Include `peerDependencies` when walking transitives. Off by
+    /// default (matches utoo's default).
+    #[arg(long)]
+    include_peer: bool,
+}
+
+#[derive(Deserialize)]
+struct PackageJson {
+    #[serde(default)]
+    dependencies: HashMap<String, String>,
+    #[serde(default, rename = "devDependencies")]
+    dev_dependencies: HashMap<String, String>,
+    #[serde(default, rename = "optionalDependencies")]
+    optional_dependencies: HashMap<String, String>,
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let raw = std::fs::read_to_string(&args.package_json)
+        .with_context(|| format!("read {:?}", args.package_json))?;
+    let pkg: PackageJson = serde_json::from_str(&raw).context("parse package.json")?;
+    let initial: Vec<(String, String)> = pkg
+        .dependencies
+        .into_iter()
+        .chain(pkg.dev_dependencies)
+        .chain(pkg.optional_dependencies)
+        .filter(|(_, spec)| is_registry_spec(spec))
+        .collect();
+
+    println!(
+        "preload-bench: registry={} concurrency={} reps={} initial={} h1_only={} ua={} include_peer={}",
+        args.registry,
+        args.concurrency,
+        args.reps,
+        initial.len(),
+        args.http1_only,
+        args.user_agent.as_deref().unwrap_or(""),
+        args.include_peer,
+    );
+
+    for rep in 1..=args.reps {
+        run_once(&args, &initial, rep).await?;
+    }
+
+    Ok(())
+}
+
+/// Quick registry-spec check (a `^...` / `~...` / `latest` / etc).
+/// Excludes `file:`, `link:`, `workspace:`, `git+`, `https://`, and
+/// `/` shorthand. Same intent as ruborist's
+/// `SpecStr::is_registry_spec` but inlined to keep this crate
+/// dependency-free.
+fn is_registry_spec(spec: &str) -> bool { + if spec.is_empty() { + return true; // bare entries default to "*" + } + let lower = spec.to_ascii_lowercase(); + if lower.starts_with("file:") + || lower.starts_with("link:") + || lower.starts_with("workspace:") + || lower.starts_with("portal:") + || lower.starts_with("git+") + || lower.starts_with("git://") + || lower.starts_with("github:") + || lower.starts_with("https://") + || lower.starts_with("http://") + { + return false; + } + // `/` shorthand — exactly one '/' and no '@' prefix on + // first segment (rules out scoped names like `@scope/pkg`). + if let Some((head, tail)) = spec.split_once('/') + && !head.starts_with('@') + && !tail.is_empty() + && !tail.contains('/') + { + return false; + } + true +} + +#[derive(Debug, Default)] +struct RepStats { + n: usize, + bytes: usize, + parse_sum_us: u128, + busy_us: u128, + sum_us: u128, + ok_200: usize, + err_4xx: usize, + err_other: usize, +} + +async fn run_once(args: &Args, initial: &[(String, String)], rep: usize) -> Result<()> { + let client = build_client(args)?; + let registry = Arc::new(args.registry.trim_end_matches('/').to_string()); + let concurrency = args.concurrency; + let include_peer = args.include_peer; + + let phase_start = Instant::now(); + let mut stats = RepStats::default(); + + // (name, spec) dedup — same shape as ruborist's seen_specs but + // self-contained. We dedup the *spec* level because two specs on + // the same name might resolve to different versions. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque<(String, String)> = VecDeque::new(); + for (name, spec) in initial { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name.clone(), spec.clone())); + } + } + + // Sibling-fetch dedup: when two specs for the same name are both + // pending, only one fetch is issued; subsequent specs settle from + // the cached body. Keyed by name. 
Maps name → cached parsed body + // (`Arc>`) once the first fetch lands. + let body_cache: Arc>>>> = + Arc::new(std::sync::Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + loop { + while futs.len() < concurrency { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + + // If the body is already cached (sibling spec for an + // already-fetched name), spawn a settle-only future. + if let Some(raw) = body_cache.lock().unwrap().get(&name).cloned() { + let n = name.clone(); + let s = spec.clone(); + let fut: Fut = Box::pin(settle_only(n, s, raw, include_peer)); + futs.push(fut); + continue; + } + + // First time seeing this name: fetch + settle. Stash any + // sibling specs that arrive while in-flight. + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + + spawn_fetch( + &client, + ®istry, + name, + spec, + Arc::clone(&body_cache), + include_peer, + &mut futs, + ); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + stats.n += 1; + stats.busy_us += out.busy_us; + stats.sum_us += out.sum_us; + stats.parse_sum_us += out.parse_us; + stats.bytes += out.bytes; + match out.status { + 200 => stats.ok_200 += 1, + 400..=499 => stats.err_4xx += 1, + _ => stats.err_other += 1, + } + + // Drain sibling specs for this name now that body is cached. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().unwrap().get(&out.name).cloned() + { + for sibling_spec in siblings { + let n = out.name.clone(); + let r = Arc::clone(&raw); + let fut: Fut = Box::pin(settle_only(n, sibling_spec, r, include_peer)); + futs.push(fut); + } + } + + // Extend pending with new transitives, dedup by (name, spec). 
+ for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } + } + + let phase_wall_ms = phase_start.elapsed().as_millis(); + let parse_sum_ms = stats.parse_sum_us / 1000; + // avg_conc = sum_request_us / busy_window_us. busy_us isn't a true + // merged-interval here (we don't track per-req start/end timestamps + // for that), so use phase_wall as the denominator — slightly + // pessimistic but consistent. + let avg_conc = if phase_wall_ms > 0 { + stats.sum_us as f64 / 1000.0 / phase_wall_ms as f64 + } else { + 0.0 + }; + + println!( + "[rep {rep}] preload_wall={phase_wall_ms}ms n={} bytes={} parse_sum={parse_sum_ms}ms avg_conc={avg_conc:.1} 200={} 4xx={} err={}", + stats.n, stats.bytes, stats.ok_200, stats.err_4xx, stats.err_other, + ); + Ok(()) +} + +#[derive(Debug)] +struct FetchOutcome { + name: String, + /// `(name, spec)` transitive deps unfolded by parsing the resolved + /// version's `dependencies` / `optionalDependencies` (and + /// optionally `peerDependencies`). + transitives: Vec<(String, String)>, + /// `true` if this future fetched the body (vs settle-only on a + /// cached body); only fetchers populate `body_cache` and trigger + /// sibling drain. + fetched: bool, + /// HTTP status code (200 / 4xx / 5xx / 0 on transport error). + status: u16, + /// Body byte count (0 on error). + bytes: usize, + /// Self-reported per-future busy_us — `end - start`. Approximate. + busy_us: u128, + /// Sum of all per-future durations summed by the main loop. + sum_us: u128, + /// Parse work done inside this future (for accounting). 
+ parse_us: u128, +} + +type Fut = std::pin::Pin + Send>>; + +fn spawn_fetch( + client: &reqwest::Client, + registry: &Arc, + name: String, + spec: String, + body_cache: Arc>>>>, + include_peer: bool, + futs: &mut FuturesUnordered, +) { + let url = format!("{}/{}", registry, name); + let client = client.clone(); + let fut: Fut = Box::pin(async move { + let start = Instant::now(); + let req = client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send(); + let (raw_bytes, status) = match req.await { + Ok(resp) => { + let status = resp.status().as_u16(); + let body = resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default(); + (body, status) + } + Err(_) => (Vec::new(), 0), + }; + let bytes = raw_bytes.len(); + + let (parse_us, transitives) = if status == 200 && !raw_bytes.is_empty() { + let raw_arc = Arc::new(raw_bytes); + body_cache + .lock() + .unwrap() + .insert(name.clone(), Arc::clone(&raw_arc)); + // Move the Arc> into spawn_blocking; the parser + // mutates a clone, so the cached copy is unaffected. 
+ let spec_for_parse = spec.clone(); + let parse_start = Instant::now(); + let result = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw_arc, &spec_for_parse, include_peer) + }) + .await + .ok() + .flatten() + .unwrap_or_default(); + (parse_start.elapsed().as_micros(), result) + } else { + (0, Vec::new()) + }; + + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: true, + status, + bytes, + busy_us, + sum_us: busy_us, + parse_us, + } + }); + futs.push(fut); +} + +async fn settle_only( + name: String, + spec: String, + raw: Arc>, + include_peer: bool, +) -> FetchOutcome { + let start = Instant::now(); + let parse_start = start; + let transitives = tokio::task::spawn_blocking(move || { + parse_and_extract(&raw, &spec, include_peer).unwrap_or_default() + }) + .await + .unwrap_or_default(); + let parse_us = parse_start.elapsed().as_micros(); + let end = Instant::now(); + let busy_us = end.duration_since(start).as_micros(); + FetchOutcome { + name, + transitives, + fetched: false, + status: 200, + bytes: 0, + busy_us, + sum_us: busy_us, + parse_us, + } +} + +/// Parse a manifest body, resolve `spec` against the version list, +/// extract that version's transitive deps. Single +/// `simd_json::to_borrowed_value` pass for the whole body — same as +/// ruborist's combined-parse path, but inlined here so this crate +/// has no ruborist dependency. +fn parse_and_extract( + raw: &Arc>, + spec: &str, + include_peer: bool, +) -> Option> { + use simd_json::prelude::{ValueAsObject, ValueObjectAccess}; + + let mut buf = (**raw).clone(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; + + let dist_tags: HashMap = parsed + .get("dist-tags") + .and_then(|v| HashMap::::deserialize(v).ok()) + .unwrap_or_default(); + let versions_obj = parsed.get("versions").and_then(ValueAsObject::as_object)?; + + // Resolve spec. 
Three cases: dist-tag match, exact-version key, or + // semver range (we approximate with "first version that satisfies" + // — preload-bench is a measurement tool, not a real resolver, so + // we tolerate slight selection differences vs ruborist for the + // purpose of timing the network path). + let resolved = if let Some(via_tag) = dist_tags.get(spec) { + via_tag.clone() + } else if versions_obj.contains_key(spec) { + spec.to_string() + } else if let Some(latest) = dist_tags.get("latest") + && spec_satisfied_by(spec, latest) + { + latest.clone() + } else { + // Last-resort: pick the lexicographically-largest version. Not + // semver-correct but bounded by the version set, and good + // enough for timing. + versions_obj.keys().max().map(|k| k.to_string())? + }; + + let version_obj = versions_obj.get(resolved.as_str())?; + let mut out: Vec<(String, String)> = Vec::new(); + + if let Some(deps) = version_obj.get("dependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if include_peer + && let Some(deps) = version_obj.get("peerDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + if let Some(deps) = version_obj.get("optionalDependencies") + && let Ok(map) = HashMap::::deserialize(deps) + { + out.extend(map.into_iter().filter(|(_, s)| is_registry_spec(s))); + } + Some(out) +} + +/// Crude semver-satisfies check: only handles `^X.Y.Z` and `~X.Y.Z` +/// against an exact target. Sufficient for "does latest satisfy spec" +/// in this measurement context — full semver is in the resolver, not +/// the bench. 
+fn spec_satisfied_by(spec: &str, target: &str) -> bool { + let s = spec.trim(); + let body = s + .strip_prefix('^') + .or_else(|| s.strip_prefix('~')) + .unwrap_or(s); + target.starts_with(body) || target == body +} + +fn build_client(args: &Args) -> Result { + // Install aws-lc-rs as the default crypto provider (idempotent — + // first call wins). Same setup as manifest-bench. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + + let mut builder = reqwest::Client::builder() + .use_preconfigured_tls(tls_config) + .no_proxy() + .pool_max_idle_per_host(256); + if args.http1_only { + builder = builder.http1_only(); + } + if let Some(ua) = &args.user_agent { + builder = builder.user_agent(ua); + } + builder.build().context("build reqwest client") +} From 01d15130d01cb6768d2fe5b4d4c577a7b4139a03 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 13:55:56 +0800 Subject: [PATCH 23/32] perf(pm): integrate standalone preload into ruborist for lockfile-only path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 1 of staged service-layer ablation. Rewrites mb_resolve as a fully self-contained streaming preload mirroring preload-bench's loop shape verbatim, but living inside ruborist so it can populate MemoryCache for the BFS phase. 
Bypasses every other ruborist service layer: * service::http::get_client — own reqwest::Client built per call, no global LazyLock, no shared_resolver dns layer, no connect_timeout, pool_max_idle_per_host(256). * service::manifest::fetch_full_manifest_with_settle — own GET + body.bytes() + spawn_blocking(simd_json::to_borrowed_value), no RetryIf, no FETCH_TIMINGS. * service::registry::UnifiedRegistry — no OnceMap, no ManifestStore, no EventReceiver. Only service::* touched is MemoryCache writes (DashMap inserts) so BFS has data to read from. PM is unaware: dispatch happens entirely inside service::api::build_deps when skip_preload=true and no warm cache. Removes the previous UTOO_RESOLVE=mb env-var gating from pm::helper::ruborist_context::Context::build_deps and pipeline::resolve_with_pipeline. Removes the now-unused service::api::build_deps_mb sibling entry point. Expected: utoo p1_resolve drops from ~2.67s toward preload-bench's ~2.57s (or better since ruborist fetches fewer names than preload-bench). The remaining gap to mb's ~1.99s would isolate incremental layer effects we add back next: - tokio runtime config / cooperative scheduling - reqwest::Client provider differences (TLS, DNS) - cache layer (DashMap vs DiskManifestStore reads on the cold path) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/pm/src/helper/ruborist_context.rs | 22 +- crates/pm/src/service/pipeline/mod.rs | 17 +- crates/ruborist/src/resolver/mb_resolve.rs | 597 ++++++++++----------- crates/ruborist/src/service/api.rs | 175 +----- crates/ruborist/src/service/mod.rs | 2 +- 5 files changed, 289 insertions(+), 524 deletions(-) diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index 542664f8c..c8b758a6f 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -84,23 +84,17 @@ impl Context { /// [`BuildDepsOutput`] (lock + project cache); the project cache is /// persisted in the background. 
/// - /// Used by the lockfile-only path (`utoo deps`). No pipeline consumes - /// `PackageResolved` events here, so preload is pure overhead — BFS's - /// own per-level parallel prefetch warms the manifest cache. - /// - /// Set `UTOO_RESOLVE=mb` to opt into the experimental - /// manifest-bench-style fetch path (`build_deps_mb`) for A/B - /// benchmarking against the current `fast_preload`. + /// Used by the lockfile-only path (`utoo deps`). With + /// `skip_preload=true`, ruborist's `service::api::build_deps` + /// internally routes through `mb_resolve::mb_fetch` — a + /// standalone manifest-bench-style preload that bypasses + /// `service::http` / `service::manifest` / `service::registry` + /// for the cold-cache lockfile-only workload. PM doesn't see + /// the dispatch. pub async fn build_deps(cwd: PathBuf) -> anyhow::Result { let mut options = Self::deps_options(cwd.clone(), ProgressReceiver).await; options.skip_preload = true; - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; spawn_save_project_cache(cwd, output.project_cache.clone()); Ok(output) } diff --git a/crates/pm/src/service/pipeline/mod.rs b/crates/pm/src/service/pipeline/mod.rs index 4169ca88d..719d31d13 100644 --- a/crates/pm/src/service/pipeline/mod.rs +++ b/crates/pm/src/service/pipeline/mod.rs @@ -41,22 +41,7 @@ pub async fn resolve_with_pipeline(root_path: &std::path::Path) -> anyhow::Resul let (options, channels) = Context::pipeline_deps_options(root_path.to_path_buf()).await; let handles = worker::start_workers(channels, root_path.to_path_buf()); - // `UTOO_RESOLVE=mb` reroutes install through the experimental - // mb-style fetch path. 
Pipeline workers are still started, but - // because mb_fetch doesn't emit `PackageResolved` events, the - // pipeline only fires once BFS completes (graph_to_package_lock - // emits `PackagePlaced` from BFS). Install becomes - // phase-sequential — fetch all manifests, then download + - // clone. Useful for A/B benchmarking the resolve phase in - // isolation; the pipelining advantage of the default path is - // lost. - let use_mb = std::env::var("UTOO_RESOLVE").as_deref() == Ok("mb"); - let output = if use_mb { - tracing::debug!("UTOO_RESOLVE=mb: routing install resolve to build_deps_mb"); - utoo_ruborist::service::build_deps_mb(options).await? - } else { - utoo_ruborist::service::build_deps(options).await? - }; + let output = utoo_ruborist::service::build_deps(options).await?; save_package_lock(root_path, &output.lock).await?; spawn_save_project_cache(root_path.to_path_buf(), output.project_cache); diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7ef0b5d85..7e1376330 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -1,47 +1,42 @@ -//! Two-phase manifest fetcher: phase 1 pure HTTP (mirrors -//! `manifest-bench` standalone exactly), phase 2 rayon batch parse + -//! settle. +//! Standalone manifest preload for the lockfile-only path. //! -//! ## Phase split +//! Mirrors `crates/preload-bench`'s loop shape verbatim, but lives +//! inside ruborist so it can populate `MemoryCache` for the BFS phase +//! to read. Used by `service::api::build_deps` whenever the caller +//! has `skip_preload=true` and no warm project cache — i.e. the +//! `utoo deps` (lockfile-only) path. //! -//! Per-fetch parse work was the real bottleneck in v1/v2 — `simd_json` -//! ran in `spawn_blocking` threads that competed with tokio runtime -//! workers for CPU on the 2-core GHA box. When 50+ parses ran in -//! parallel, tokio workers couldn't drive sockets, so `eff_parallel` -//! 
capped at ~47 against the 96 cap (vs `manifest-bench` standalone's -//! 75 on the same box). +//! Bypasses every other ruborist service layer: +//! * `service::http::get_client` — own `reqwest::Client` built per +//! call, no global LazyLock, no `dns_resolver(shared_resolver)`, +//! no `connect_timeout`, `pool_max_idle_per_host(256)` matching +//! `preload-bench` / `manifest-bench`. +//! * `service::manifest::fetch_full_manifest_with_settle` — own +//! `reqwest::get + body.bytes() + spawn_blocking(simd_json +//! to_borrowed_value)`, no `RetryIf`, no `FETCH_TIMINGS`. +//! * `service::registry::UnifiedRegistry` — no `OnceMap` inflight +//! gates, no `ManifestStore`, no `EventReceiver`. //! -//! v3 separates the work: +//! The only `service::*` touched is `MemoryCache::set_full_manifest` +//! and `MemoryCache::set_version_manifest` — thin DashMap wrappers +//! the BFS phase reads from. Without that, BFS would have nothing to +//! resolve against. //! -//! - **Phase 1** — `mb_style_pure_fetch` is a structural copy of -//! `manifest-bench`'s main loop: `spawn_one` (GET + body recv, -//! nothing else) + 1-for-1 refill on completion. The future body -//! has zero CPU work, so the tokio runtime workers retain full CPU -//! to drive sockets and `eff_parallel` reaches the same level as -//! the standalone bench. -//! -//! - **Phase 2** — bulk parse on rayon (off the tokio runtime). For -//! each fetched body: parse `FullManifest` envelope, resolve every -//! spec we need for this name, materialize `CoreVersionManifest` -//! subtrees, populate cache slots, collect transitive deps for the -//! next iteration. -//! -//! Phases alternate until `pending` is empty (typical project: 3-5 -//! iterations as transitive deps fan out wave by wave). -//! -//! Phase 1 is the line we measure against `manifest-bench` — -//! `p1-breakdown mb_fetch_iter=N phase1_http_wall=...` traces let us -//! check eff_parallel directly. -//! -//! Wired in via `UTOO_RESOLVE=mb` env var (see -//! 
`pm::helper::ruborist_context::Context::build_deps`). - -use std::collections::{HashMap, HashSet}; +//! Why a separate path: same-run CI data shows `preload-bench` +//! (self-contained, transitive walk, 4153 fetches) lands at ~2.57s +//! while ruborist's existing `fast_preload` path (combined parse via +//! service layers, 2733 fetches) lands at ~2.67s on the same network +//! — so on a per-fetch basis the service-layer path is ~50 % slower. +//! Removing the layers should close that gap. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::pin::Pin; use std::sync::Arc; +use std::time::Instant; -use bytes::Bytes; +use anyhow::{Context, Result}; use futures::stream::{FuturesUnordered, StreamExt}; -use rayon::prelude::*; +use parking_lot::Mutex; use serde::Deserialize; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -49,38 +44,29 @@ use crate::model::node::PeerDeps; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; -use crate::service::http::get_client; use crate::spec::SpecStr; #[derive(Debug, Default)] pub struct MbFetchStats { pub success: usize, pub fail: usize, - pub iterations: usize, -} - -/// Phase 1 result: one body per fetched name. `bytes` is `None` on -/// transport / non-2xx — kept in the result vector so phase 2 can -/// account for it, but contributes no settle work. -struct FetchOutcome { - name: String, - bytes: Option, } -/// Phase 2 per-name output. `full` is `None` on parse failure. -struct ParseOutcome { - name: String, - full: Option>, - /// Per-spec settled subtrees: `(spec, resolved_version, core)`. - /// Empty when the body failed to fetch / parse, or when no spec - /// resolves against the manifest. - settled: Vec<(String, String, Arc)>, - /// Transitive deps collected across all settled subtrees for this - /// name. Already filtered to registry specs; the main loop dedups - /// against `done_names` before queueing. 
- transitives: Vec, +/// Build a fresh `reqwest::Client` matching `preload-bench` / +/// `manifest-bench` exactly, except for the TLS provider — those +/// benches use aws-lc-rs but we keep ruborist's existing default +/// rustls (ring on Linux). If A/B data shows TLS is the remaining +/// gap, we'll add the aws-lc-rs deps separately. +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .pool_max_idle_per_host(256) + .http1_only() + .build() + .context("build reqwest client for mb_resolve") } +/// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() .flatten() @@ -99,177 +85,183 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve out } -/// Phase 1 — structural copy of `manifest-bench`'s main loop. Future -/// body does ONLY GET + body recv; no parse, no cache writes, no -/// dedup. Returns one `FetchOutcome` per input name in arrival order. -async fn mb_style_pure_fetch( - names: Vec, - registry_url: &str, - concurrency: usize, -) -> Vec { - let client = match get_client() { - Ok(c) => c.clone(), - Err(e) => { - tracing::warn!("get_client failed: {e}"); - return Vec::new(); - } - }; - - let mut results: Vec = Vec::with_capacity(names.len()); - let mut futs = FuturesUnordered::new(); - let mut idx = 0usize; - - let spawn_one = |client: &reqwest::Client, - registry_url: &str, - name: String, - futs: &mut FuturesUnordered<_>| { - let url = format!("{}/{}", registry_url, name); - let client = client.clone(); - futs.push(Box::pin(async move { - let bytes = match client - .get(&url) - .header("accept", "application/vnd.npm.install-v1+json") - .send() - .await - { - Ok(resp) if resp.status().is_success() => resp.bytes().await.ok(), - _ => None, - }; - FetchOutcome { name, bytes } - })); - }; +/// What a future returns when it lands. 
The main loop uses +/// `transitives` to extend `pending`, plus the cache writes already +/// happened inside the future. Only `fetched=true` futures populate +/// `body_cache` and trigger sibling drain. +struct FetchOutcome { + name: String, + transitives: Vec, + fetched: bool, +} - while idx < names.len() && futs.len() < concurrency { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } +type Fut = Pin + Send>>; - while let Some(outcome) = futs.next().await { - results.push(outcome); - if idx < names.len() { - spawn_one(&client, registry_url, names[idx].clone(), &mut futs); - idx += 1; - } - } +/// `(name, spec) → (FullManifest, resolved_version, version_subtree, transitive_deps)`. +type ParseResult = ( + Arc, + String, + Arc, + Vec, +); - results -} +/// Single combined parse: one `simd_json::to_borrowed_value` over the +/// raw body extracts the envelope (name, dist-tags, versions keys) +/// AND deserializes the resolved version's `CoreVersionManifest` +/// subtree. Same shape as the parse step in `preload-bench`. +fn parse_combined(raw: Arc<[u8]>, spec: &str, peer_deps: PeerDeps) -> Option { + use simd_json::prelude::{ValueAsObject, ValueAsScalar, ValueObjectAccess}; -/// Sync phase 2 worker: parse one body, settle all specs we need for -/// this name. Runs on rayon (called from `par_iter` in -/// `parse_settle_batch`). 
-fn parse_one_body( - name: String, - raw: Bytes, - specs: Vec, - peer_deps: PeerDeps, -) -> ParseOutcome { - use simd_json::prelude::{ValueAsScalar, ValueObjectAccess}; - - let raw_arc: Arc<[u8]> = Arc::from(raw.as_ref()); - let mut buf = raw.to_vec(); - let parsed = match simd_json::to_borrowed_value(&mut buf) { - Ok(v) => v, - Err(_) => { - return ParseOutcome { - name, - full: None, - settled: Vec::new(), - transitives: Vec::new(), - }; - } - }; + let mut buf = (*raw).to_vec(); + let parsed = simd_json::to_borrowed_value(&mut buf).ok()?; - let envelope_name = parsed + let name = parsed .get("name") .and_then(|v| v.as_str()) .map(|s| s.to_string()) - .unwrap_or_else(|| name.clone()); + .unwrap_or_default(); let dist_tags: HashMap = parsed .get("dist-tags") .and_then(|v| HashMap::::deserialize(v).ok()) .unwrap_or_default(); let versions_keys: Vec = parsed .get("versions") - .and_then(simd_json::prelude::ValueAsObject::as_object) + .and_then(ValueAsObject::as_object) .map(|obj| obj.keys().map(|k| k.to_string()).collect()) .unwrap_or_default(); let full = FullManifest { - name: envelope_name, + name, dist_tags, versions: versions_keys, - raw: Arc::clone(&raw_arc), + raw: Arc::clone(&raw), ..Default::default() }; - let full_arc = Arc::new(full); - - // For each requested spec, resolve + extract version subtree. - // Cache the per-(name, version) `CoreVersionManifest` so sibling - // specs that resolve to the same version reuse the same Arc. 
- let mut version_cache: HashMap> = HashMap::new(); - let mut settled = Vec::with_capacity(specs.len()); - let mut transitives = Vec::new(); - - for spec in specs { - let Ok(resolved_version) = resolve_target_version((&*full_arc).into(), &spec) else { - continue; + + let resolved = resolve_target_version((&full).into(), spec).ok()?; + let core = parsed + .get("versions") + .and_then(|v| v.get(resolved.as_str())) + .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok())?; + let core_arc = Arc::new(core); + let transitives = extract_transitive(&core_arc, peer_deps); + + Some((Arc::new(full), resolved, core_arc, transitives)) +} + +/// Fetch + combined parse + cache write for one `(name, spec)`. +/// Future body owns all per-fetch work; main loop only extends +/// `pending` from the returned transitives and refills `futs`. +fn spawn_fetch( + client: reqwest::Client, + registry_url: Arc, + name: String, + spec: String, + cache: MemoryCache, + body_cache: Arc>>>, + peer_deps: PeerDeps, +) -> Fut { + Box::pin(async move { + let url = format!("{}/{}", registry_url, name); + let resp = match client + .get(&url) + .header("accept", "application/vnd.npm.install-v1+json") + .send() + .await + { + Ok(r) if r.status().is_success() => r, + _ => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } }; - let core_arc = if let Some(cached) = version_cache.get(&resolved_version) { - Arc::clone(cached) - } else { - let Some(core) = parsed - .get("versions") - .and_then(|v| v.get(resolved_version.as_str())) - .and_then(|version_obj| CoreVersionManifest::deserialize(version_obj).ok()) - else { - continue; - }; - let arc = Arc::new(core); - version_cache.insert(resolved_version.clone(), Arc::clone(&arc)); - arc + let raw_bytes = match resp.bytes().await { + Ok(b) => b, + Err(_) => { + return FetchOutcome { + name, + transitives: Vec::new(), + fetched: true, + }; + } + }; + let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); + // 
Stash in body_cache early so concurrent sibling specs + // arriving slightly after see it on their pending pop. + body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); + + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = + tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), }; - transitives.extend(extract_transitive(&core_arc, peer_deps)); - settled.push((spec, resolved_version, core_arc)); - } - ParseOutcome { - name, - full: Some(full_arc), - settled, - transitives, - } + FetchOutcome { + name, + transitives, + fetched: true, + } + }) } -/// Phase 2 dispatcher: hands the batch to rayon, awaits the result. -async fn parse_settle_batch( - bodies: Vec, - by_name: HashMap>, +/// Settle-only future for a sibling spec whose `(name)` body already +/// landed via a sibling fetch. Same combined parse, no network. 
+fn spawn_settle( + name: String, + spec: String, + raw: Arc<[u8]>, + cache: MemoryCache, peer_deps: PeerDeps, -) -> Vec { - let work: Vec<(String, Bytes, Vec)> = bodies - .into_iter() - .filter_map(|f| { - let bytes = f.bytes?; - let specs = by_name.get(&f.name).cloned().unwrap_or_default(); - Some((f.name, bytes, specs)) +) -> Fut { + Box::pin(async move { + let spec_for_parse = spec.clone(); + let peer = peer_deps; + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(Arc::clone(&raw), &spec_for_parse, peer) }) - .collect(); - - if work.is_empty() { - return Vec::new(); - } + .await + .ok() + .flatten(); + + let transitives = match parsed { + Some((full_arc, resolved, core_arc, transitives)) => { + // Don't overwrite full_manifest — the original fetcher + // already set it. Only populate the version-manifest + // slots so BFS hits the (name, spec) early-return. + cache.set_full_manifest(name.clone(), full_arc); + cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); + cache.set_version_manifest(name.clone(), resolved, core_arc); + transitives + } + None => Vec::new(), + }; - tokio::task::spawn_blocking(move || { - work.into_par_iter() - .map(|(name, raw, specs)| parse_one_body(name, raw, specs, peer_deps)) - .collect::>() + FetchOutcome { + name, + transitives, + fetched: false, + } }) - .await - .unwrap_or_default() } -/// Two-phase mb-style fetch with rayon batch parse. See module docs. +/// Streaming preload with transitive walk. Self-contained — no +/// dependency on `service::http` / `service::manifest` / +/// `service::registry` beyond `MemoryCache` writes. pub async fn mb_fetch( initial_deps: Vec, registry_url: &str, @@ -277,154 +269,109 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); - let mut pending_specs: Vec = initial_deps; - // (name, spec) pairs we've already processed (settled or queued - // to settle). 
Without this, sibling-settle's transitive deps can - // re-introduce already-walked specs and the outer loop never - // terminates — peer / optional dep cycles trivially trigger this. - let mut seen_specs: HashSet<(String, String)> = HashSet::new(); - let mut done_names: HashSet = HashSet::new(); - let conc = config.concurrency; + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return stats; + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = config.concurrency; let peer_deps = config.peer_deps; - let total_start = tokio::time::Instant::now(); - // Filter the initial seed through `seen_specs` too — root + workspace - // edges can list the same dep multiple times across workspaces. - pending_specs.retain(|(n, s)| seen_specs.insert((n.clone(), s.clone()))); + // Spec-level dedup across the entire run. + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + for (name, spec) in initial_deps { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } - while !pending_specs.is_empty() { - stats.iterations += 1; - let iter = stats.iterations; + // Sibling-fetch dedup: when two specs for the same name are both + // in flight, only the first fires a fetch; the second arrives at + // the cached body and goes through `spawn_settle` instead. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); - // Group this iteration's pending specs by name. - let mut by_name: HashMap> = HashMap::new(); - for (name, spec) in pending_specs.drain(..) 
{ - by_name.entry(name).or_default().push(spec); - } + let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Names whose full manifest is already cached from a prior - // iteration: settle their siblings synchronously (cheap - // semver match + cache lookup; no parse if version_manifest - // already cached, otherwise quick simd_json subtree extract). - let mut sibling_only: Vec<(String, Vec)> = Vec::new(); - let mut to_fetch: Vec = Vec::with_capacity(by_name.len()); - for (name, specs) in &by_name { - if done_names.contains(name) { - sibling_only.push((name.clone(), specs.clone())); - } else { - to_fetch.push(name.clone()); + loop { + // Refill to cap. + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + // Sibling fast path: body already cached. + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; } - } - - // Sibling settles (rare on real workloads — most names appear - // exactly once across the whole walk). New transitives go - // through `seen_specs` dedup before joining `pending_specs`. - for (name, specs) in sibling_only { - let Some(full) = cache.get_full_manifest(&name) else { + // Defer if a fetch for this name is already in flight. 
+ if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); continue; - }; - for spec in specs { - let Ok(resolved) = resolve_target_version((&*full).into(), &spec) else { - continue; - }; - let new_deps = if let Some(cached) = cache.get_version_manifest(&name, &resolved) { - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&cached)); - extract_transitive(&cached, peer_deps) - } else if let Some(core) = full.get_core_version(&resolved) { - let core_arc = Arc::new(core); - cache.set_version_manifest(name.clone(), spec.clone(), Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, Arc::clone(&core_arc)); - extract_transitive(&core_arc, peer_deps) - } else { - Vec::new() - }; - pending_specs.extend( - new_deps - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))), - ); } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); } - if to_fetch.is_empty() { - // Iteration drained pending entirely via sibling settles. - continue; + if futs.is_empty() { + break; } - // PHASE 1 — pure HTTP, mb-style. - let p1_start = tokio::time::Instant::now(); - let bodies = mb_style_pure_fetch(to_fetch.clone(), registry_url, conc).await; - let p1_wall = p1_start.elapsed().as_millis(); - let total_bytes: usize = bodies - .iter() - .map(|b| b.bytes.as_ref().map(|v| v.len()).unwrap_or(0)) - .sum(); - tracing::info!( - "p1-breakdown mb_fetch iter={} phase1_http_wall={}ms n={} bytes={}", - iter, - p1_wall, - to_fetch.len(), - total_bytes, - ); - - // PHASE 2 — rayon batch parse + settle. 
- let p2_start = tokio::time::Instant::now(); - let by_name_for_parse = by_name - .iter() - .filter(|(name, _)| !done_names.contains(*name)) - .map(|(n, s)| (n.clone(), s.clone())) - .collect::>(); - let parsed = parse_settle_batch(bodies, by_name_for_parse, peer_deps).await; - let p2_wall = p2_start.elapsed().as_millis(); - - let mut new_transitives: Vec = Vec::new(); - let mut settle_count = 0usize; - let mut fail_count = 0usize; - for outcome in parsed { - done_names.insert(outcome.name.clone()); - let Some(full_arc) = outcome.full else { - fail_count += 1; - continue; - }; - cache.set_full_manifest(outcome.name.clone(), Arc::clone(&full_arc)); - for (spec, resolved, core) in outcome.settled { - cache.set_version_manifest(outcome.name.clone(), spec, Arc::clone(&core)); - cache.set_version_manifest(outcome.name.clone(), resolved, Arc::clone(&core)); - settle_count += 1; - } - new_transitives.extend(outcome.transitives); - } - // Names that fetched but failed parse — still mark done so we - // don't refetch them next iteration. - for name in to_fetch { - done_names.insert(name); + let Some(out) = futs.next().await else { break }; + + if out.transitives.is_empty() && out.fetched { + // Empty result from a fetch is ambiguous (no transitives + // OR a fetch/parse failure). Track conservatively as + // success — the FETCH_TIMINGS-equivalent counter is + // omitted in this path on purpose to keep the future + // body lean. + stats.success += 1; + } else if out.fetched { + stats.success += 1; } - stats.success += settle_count; - stats.fail += fail_count; - - let new_unique: Vec = new_transitives - .into_iter() - .filter(|(n, s)| seen_specs.insert((n.clone(), s.clone()))) - .collect(); - - tracing::info!( - "p1-breakdown mb_fetch iter={} phase2_parse_wall={}ms settles={} fail={} new_unique={}", - iter, - p2_wall, - settle_count, - fail_count, - new_unique.len(), - ); + // Drain sibling specs deferred while the fetch was in flight. 
+ if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } - pending_specs.extend(new_unique); + // Extend pending with new transitive specs, dedup. + for (name, spec) in out.transitives { + if seen.insert((name.clone(), spec.clone())) { + pending.push_back((name, spec)); + } + } } let total_wall = total_start.elapsed().as_millis(); tracing::info!( - "p1-breakdown mb_fetch total_wall={}ms iters={} settled={} fail={}", + "p1-breakdown mb_fetch wall={}ms ok={} fail={}", total_wall, - stats.iterations, stats.success, stats.fail, ); diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 9687fc875..06079b248 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -38,7 +38,6 @@ use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, gather_preload_deps, }; -use crate::resolver::fast_preload::fast_preload; use crate::resolver::mb_resolve::mb_fetch; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; @@ -275,19 +274,19 @@ where // Lockfile-only callers (`utoo deps`) skip the receiver-driven // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Run `fast_preload` instead — a flat - // `FuturesUnordered` over `fetch_full_manifest` that warms the - // `MemoryCache` so the BFS phase below is pure cache-hit. This is - // the manifest-bench-style path; the heavier `preload_manifests` - // path (with `OnceMap` gates + `EventReceiver` events) only runs - // for install paths that need the pipeline signal. + // `BuildEvent::PackageResolved`. 
Route through `mb_fetch` — a + // ruborist-internal standalone preload that bypasses + // `service::http`, `service::manifest`, and `service::registry` + // to match `manifest-bench`'s loop shape directly. PM is + // unaware: this dispatch happens entirely inside ruborist when + // `skip_preload=true` and there's no warm project cache. if skip_preload_caller && cache_count == 0 { let initial_deps = gather_preload_deps(&graph, peer_deps); let preload_config = PreloadConfig { peer_deps, concurrency, }; - fast_preload( + mb_fetch( initial_deps, registry.registry_url(), registry.cache(), @@ -333,166 +332,6 @@ where }) } -/// Experimental parallel-track entry point: structurally identical to -/// [`build_deps`] but routes the manifest-fetch phase through -/// [`crate::resolver::mb_resolve::mb_fetch`] instead of -/// [`crate::resolver::fast_preload::fast_preload`]. -/// -/// Intended for A/B benchmarking: install + lockfile-only callers can -/// opt in via `UTOO_RESOLVE=mb` (wired in `pm::helper::ruborist_context`). -/// All other behavior — workspace discovery, runtime injection, BFS, -/// graph→lock serialization, project cache export — is the same as -/// `build_deps`. The `EventReceiver` still receives BFS events; it -/// does NOT receive `PreloadFetching` / `PreloadProgress` events -/// because mb_fetch is silent (matches `manifest-bench`'s zero-event -/// loop). -/// -/// **Install-path note:** `pipeline_deps_options` callers that need -/// `PackageResolved` events to drive the download/clone pipeline -/// won't pipeline under this path — mb_fetch finishes all fetches -/// before BFS starts. Use only for `utoo deps`-style workloads, or -/// accept that install becomes phase-sequential. 
-pub async fn build_deps_mb(options: BuildDepsOptions) -> Result -where - G: Glob + Clone, - R: EventReceiver, -{ - let BuildDepsOptions { - cwd, - registry_url, - cache_dir, - manifest_store, - warm_project_cache, - concurrency, - peer_deps, - glob, - receiver, - supports_semver, - catalogs, - skip_preload: _, - } = options; - - // Steps 1-6: structurally identical to `build_deps` — read - // package.json, inject runtime deps, build initial graph, add - // root edges, discover and add workspaces. - let discovery = WorkspaceDiscovery::new(glob.clone()); - let root_path = discovery.find_root_path(&cwd).await?; - let pkg_path = root_path.join("package.json"); - let mut pkg: PackageJson = super::fs::read_json(&pkg_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to read/parse package.json: {}", e))?; - - if let Some(engines) = &pkg.engines { - let runtime_deps = install_runtime_from_map(engines); - if !runtime_deps.is_empty() { - for (name, version) in runtime_deps { - pkg.optional_dependencies - .get_or_insert_with(HashMap::new) - .entry(name) - .or_insert(version); - } - } - } - - let mut graph = DependencyGraph::from_package_json(root_path.clone(), pkg.clone()); - let root_index = graph.root_index; - let edge_ctx = EdgeContext::new(peer_deps, DevDeps::Include).with_catalogs(&catalogs); - add_edges_from(&mut graph, root_index, &pkg, &edge_ctx); - - let workspaces = discovery.find_workspaces_from_pkg(&root_path, &pkg).await?; - for workspace in workspaces { - let ws_pkg = workspace.package_json; - let workspace_node = - PackageNode::workspace_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let workspace_index = graph.add_node(workspace_node); - let link_node = PackageNode::link_from_package_json(workspace.path.clone(), ws_pkg.clone()); - let link_index = graph.add_node(link_node); - graph.add_physical_edge(root_index, workspace_index); - graph.add_physical_edge(root_index, link_index); - let dep_edge_id = graph.add_dependency_edge( - root_index, - 
workspace.name.clone(), - &ws_pkg.version, - EdgeType::Prod, - ); - graph.mark_dependency_resolved(dep_edge_id, workspace_index); - add_edges_from(&mut graph, workspace_index, &ws_pkg, &edge_ctx); - } - - // Step 7-8: cache + registry, same as `build_deps`. Warm project - // cache is honored. - let package_cache = Arc::new(PackageCache::default()); - let (cache_count, _) = prepopulate_warm_cache(&package_cache, warm_project_cache.as_ref()); - - let mut builder = UnifiedRegistry::builder() - .registry(®istry_url) - .cache(package_cache) - .store(Arc::clone(&manifest_store)); - if let Some(semver) = supports_semver { - builder = builder.supports_semver(semver); - } - let registry = builder.build(); - - // Run mb_fetch instead of fast_preload — pre-warms cache by - // walking transitive deps via flat FuturesUnordered. Skipped if - // the warm project cache already covers the workload. - if cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); - let preload_config = PreloadConfig { - peer_deps, - concurrency, - }; - mb_fetch( - initial_deps, - registry.registry_url(), - registry.cache(), - &preload_config, - ) - .await; - } - - // BFS phase reads the now-warm cache. `skip_preload=true` skips - // the receiver-driven preload — mb_fetch already ran. 
- let mut config = BuildDepsConfig::default() - .with_peer_deps(peer_deps) - .with_concurrency(concurrency) - .with_skip_preload(true) - .with_catalogs(catalogs); - if let Some(dir) = cache_dir { - config = config.with_cache_dir(dir); - } - - build_deps_with_config(&mut graph, ®istry, config, &receiver) - .await - .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; - - let t_serialize_start = std::time::Instant::now(); - let (packages, _total) = graph.serialize_to_packages(&root_path); - let serialize_us = t_serialize_start.elapsed().as_micros() as u64; - - let t_cache_export_start = std::time::Instant::now(); - let mut project_cache = ProjectCacheData::default(); - for (key, manifest) in registry.cache().export_version_manifests() { - let (name, spec) = parse_package_spec(&key); - let version = manifest.version.clone(); - let pkg_cache = project_cache.cache.entry(name.to_string()).or_default(); - pkg_cache.specs.insert(spec.to_string(), version.clone()); - pkg_cache.manifests.insert(version, (*manifest).clone()); - } - let cache_export_us = t_cache_export_start.elapsed().as_micros() as u64; - - tracing::info!( - "p1-breakdown serialize_us={} cache_export_us={}", - serialize_us, - cache_export_us, - ); - - Ok(BuildDepsOutput { - lock: PackageLock::new(&pkg.name, &pkg.version, packages), - project_cache, - }) -} - /// Pre-populate `cache` from a warm project cache. 
Returns /// `(loaded, missing)` — `loaded` is the count of usable spec→manifest /// entries; `missing` counts specs whose resolved version had no manifest diff --git a/crates/ruborist/src/service/mod.rs b/crates/ruborist/src/service/mod.rs index 7a7cf8ca8..5adb6bf0b 100644 --- a/crates/ruborist/src/service/mod.rs +++ b/crates/ruborist/src/service/mod.rs @@ -52,7 +52,7 @@ mod manifest; mod registry; mod store; -pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps, build_deps_mb}; +pub use api::{BuildDepsOptions, BuildDepsOutput, build_deps}; pub use cache::{ CacheStats, MemoryCache, PackageCache, ProjectCacheData, ProjectPackageCache, Versions, VersionsInfo, From d9fb207026b1429a2a5c897df3077ed5e0377022 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 14:42:10 +0800 Subject: [PATCH 24/32] perf(pm): aws-lc-rs TLS for mb_resolve + per-stage breakdown for BFS/mb_fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 2 of staged service-layer ablation. Targets the two gaps left after step 1: 1. mb_fetch (in ruborist): 2300ms / 2735 = 0.84 ms/name manifest-bench (standalone): 2010ms / 2735 = 0.72 ms/name ~290ms gap on same workload, same conc. 2. BFS phase: 305ms wall against a fully-warm MemoryCache. Origin unclear — could be graph mutations, repeated cache lookups via the inflight gate, or event dispatch. Changes: * TLS provider — adds rustls (aws-lc-rs) + rustls-native-certs to non-wasm-non-macos targets. mb_resolve's `build_mb_client` now uses `use_preconfigured_tls(aws_lc_rs)` matching preload-bench / manifest-bench exactly. The reqwest crate's `rustls-tls-native-roots` feature on Linux still bundles ring for service::http's global client; the two providers coexist. * mb_fetch instrumentation — per-future `wall_us` (network + parse + cache writes) and `net_us` (network only) reported in the trace line as `eff_par_full`, `eff_par_net`, `avg_wall`, `avg_net`. 
Same shape as manifest-bench's `avg_conc` so we can compare directly. * BFS instrumentation — splits run_bfs_phase wall into: - `collect_us`: collect_unresolved_edges sum - `resolve_us`: process_dependency .await sum - `event_us`: post-resolve event dispatch (Resolved / PackagePlaced / Reused / Skipped) sum Plus `levels` and `edges` counters. Trace line lets us attribute the 305ms. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/Cargo.toml | 12 ++- crates/ruborist/src/resolver/builder.rs | 28 ++++- crates/ruborist/src/resolver/mb_resolve.rs | 113 +++++++++++++++++++-- 3 files changed, 141 insertions(+), 12 deletions(-) diff --git a/crates/ruborist/Cargo.toml b/crates/ruborist/Cargo.toml index fdda5ea5e..57d96f187 100644 --- a/crates/ruborist/Cargo.toml +++ b/crates/ruborist/Cargo.toml @@ -52,9 +52,17 @@ workspace = true [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } -# Native (non-macOS) targets: reqwest's default rustls + ring. +# Native (non-macOS) targets: reqwest's default rustls + ring (used by +# `service::http`'s global client). `mb_resolve` separately brings +# `rustls` (with aws-lc-rs) and `rustls-native-certs` to build its +# own client via `use_preconfigured_tls(aws_lc_rs)` — same TLS choice +# as `manifest-bench` / `preload-bench`. The two providers coexist: +# reqwest's internal client uses ring; `mb_resolve`'s explicit client +# uses aws-lc-rs. 
[target.'cfg(not(any(target_arch = "wasm32", target_os = "macos")))'.dependencies] -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots"] } +rustls = { version = "0.23", default-features = false, features = ["aws-lc-rs", "logging", "std", "tls12"] } +rustls-native-certs = "0.8" # Native-only dependencies (not compiled for WASM) [target.'cfg(not(target_arch = "wasm32"))'.dependencies] diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index 156622502..a5d3e12a4 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -825,7 +825,18 @@ async fn run_bfs_phase( let start = tokio::time::Instant::now(); let mut current_level = vec![graph.root_index]; + // Per-stage instrumentation. The full BFS wall is `bfs_elapsed` + // below; these split it into work types so we can see whether + // graph traversal, edge resolution, or post-resolve event + // dispatch dominates. 
+ let mut total_collect_us: u64 = 0; + let mut total_resolve_us: u64 = 0; + let mut total_event_us: u64 = 0; + let mut total_edges: u64 = 0; + let mut total_levels: u64 = 0; + while !current_level.is_empty() { + total_levels += 1; receiver.on_event(BuildEvent::LevelStart { node_count: current_level.len(), }); @@ -846,7 +857,10 @@ async fn run_bfs_phase( } // Process unresolved dependencies + let collect_start = std::time::Instant::now(); let unresolved = collect_unresolved_edges(graph, node_index); + total_collect_us += collect_start.elapsed().as_micros() as u64; + total_edges += unresolved.len() as u64; receiver.on_event(BuildEvent::DependencyCount { count: unresolved.len(), }); @@ -855,6 +869,7 @@ async fn run_bfs_phase( receiver.on_event(BuildEvent::Resolving { name: &edge_info.name, }); + let resolve_start = std::time::Instant::now(); let result = process_dependency(graph, registry, node_index, &edge_info, config) .await .map_err(|inner| { @@ -865,7 +880,10 @@ async fn run_bfs_phase( source: Box::new(inner), } }); - match result? 
{ + total_resolve_us += resolve_start.elapsed().as_micros() as u64; + let event_start = std::time::Instant::now(); + let processed = result?; + match processed { ProcessResult::Created(idx) => { // Extract node info for events if let Some(node) = graph.get_node(idx) { @@ -905,6 +923,7 @@ async fn run_bfs_phase( }); } } + total_event_us += event_start.elapsed().as_micros() as u64; } } @@ -917,8 +936,13 @@ async fn run_bfs_phase( let bfs_elapsed = start.elapsed(); tracing::debug!("Build phase: {:?}", bfs_elapsed); tracing::info!( - "p1-breakdown bfs_wall={}ms | {}", + "p1-breakdown bfs_wall={}ms levels={} edges={} collect={}us resolve={}us event={}us | {}", bfs_elapsed.as_millis(), + total_levels, + total_edges, + total_collect_us, + total_resolve_us, + total_event_us, crate::util::FETCH_TIMINGS.snapshot().summary_line(), ); Ok(()) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 7e1376330..a4b2ba8c1 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -34,7 +34,7 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Instant; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; use serde::Deserialize; @@ -53,12 +53,37 @@ pub struct MbFetchStats { } /// Build a fresh `reqwest::Client` matching `preload-bench` / -/// `manifest-bench` exactly, except for the TLS provider — those -/// benches use aws-lc-rs but we keep ruborist's existing default -/// rustls (ring on Linux). If A/B data shows TLS is the remaining -/// gap, we'll add the aws-lc-rs deps separately. +/// `manifest-bench` exactly: aws-lc-rs TLS provider via +/// `use_preconfigured_tls`, `pool_max_idle_per_host(256)`, no +/// proxy, `http1_only`. 
The reqwest crate's +/// `rustls-tls-native-roots` feature on Linux still bundles ring +/// for `service::http`'s global client, but this client overrides +/// at construction time — both providers coexist in the binary. +#[cfg(not(target_arch = "wasm32"))] fn build_mb_client() -> Result { + // Idempotent: first install_default wins; subsequent calls are + // no-ops. Sets the process-wide default for any rustls consumer + // that builds a `ClientConfig` without explicit provider. + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + + let mut roots = rustls::RootCertStore::empty(); + let native = rustls_native_certs::load_native_certs(); + for cert in native.certs { + // Tolerate individual bad roots — same defensive load pattern + // as `service::http::build_rustls_config`. + let _ = roots.add(cert); + } + + let tls_config = rustls::ClientConfig::builder_with_provider(std::sync::Arc::new( + rustls::crypto::aws_lc_rs::default_provider(), + )) + .with_safe_default_protocol_versions() + .map_err(|e| anyhow!("rustls protocol versions: {e}"))? + .with_root_certificates(roots) + .with_no_client_auth(); + reqwest::Client::builder() + .use_preconfigured_tls(tls_config) .no_proxy() .pool_max_idle_per_host(256) .http1_only() @@ -66,6 +91,14 @@ fn build_mb_client() -> Result { .context("build reqwest client for mb_resolve") } +#[cfg(target_arch = "wasm32")] +fn build_mb_client() -> Result { + reqwest::Client::builder() + .no_proxy() + .build() + .context("build reqwest client for mb_resolve") +} + /// Collect deps from a deps map, filtering non-registry specs. fn collect_deps(map: Option<&HashMap>) -> Vec { map.into_iter() @@ -93,6 +126,14 @@ struct FetchOutcome { name: String, transitives: Vec, fetched: bool, + /// Per-future wall (network + body recv + spawn_blocking parse). + /// Summed across all futures, divided by mb_fetch total wall = + /// eff_parallel — the same number `manifest-bench` reports as + /// `avg_conc`. 
Used to spot wave-shape underutilization. + wall_us: u64, + /// Per-future network-only wall (request.send + body.bytes). + /// `wall_us - net_us` is the spawn_blocking parse contribution. + net_us: u64, } type Fut = Pin + Send>>; @@ -162,6 +203,7 @@ fn spawn_fetch( peer_deps: PeerDeps, ) -> Fut { Box::pin(async move { + let fut_start = Instant::now(); let url = format!("{}/{}", registry_url, name); let resp = match client .get(&url) @@ -171,23 +213,30 @@ fn spawn_fetch( { Ok(r) if r.status().is_success() => r, _ => { + let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, transitives: Vec::new(), fetched: true, + wall_us, + net_us: wall_us, }; } }; let raw_bytes = match resp.bytes().await { Ok(b) => b, Err(_) => { + let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, transitives: Vec::new(), fetched: true, + wall_us, + net_us: wall_us, }; } }; + let net_us = fut_start.elapsed().as_micros() as u64; let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); // Stash in body_cache early so concurrent sibling specs // arriving slightly after see it on their pending pop. @@ -211,10 +260,13 @@ fn spawn_fetch( None => Vec::new(), }; + let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, transitives, fetched: true, + wall_us, + net_us, } }) } @@ -229,6 +281,7 @@ fn spawn_settle( peer_deps: PeerDeps, ) -> Fut { Box::pin(async move { + let fut_start = Instant::now(); let spec_for_parse = spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { @@ -251,10 +304,14 @@ fn spawn_settle( None => Vec::new(), }; + let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, transitives, fetched: false, + wall_us, + // Settle-only futures have no network component. 
+ net_us: 0, } }) } @@ -269,6 +326,15 @@ pub async fn mb_fetch( config: &PreloadConfig, ) -> MbFetchStats { let mut stats = MbFetchStats::default(); + // Per-future wall + net sums for eff_parallel computation. + // sum_wall_us / total_wall_ms / 1000 = eff_parallel for the + // whole future-body span (network + parse + cache writes). + // sum_net_us / total_wall_ms / 1000 = network-only eff_parallel, + // directly comparable to manifest-bench's avg_conc. + let mut sum_wall_us: u64 = 0; + let mut sum_net_us: u64 = 0; + let mut fetch_count: u64 = 0; + let mut settle_count: u64 = 0; let total_start = Instant::now(); let client = match build_mb_client() { @@ -333,6 +399,14 @@ pub async fn mb_fetch( let Some(out) = futs.next().await else { break }; + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + } else { + settle_count += 1; + } + if out.transitives.is_empty() && out.fetched { // Empty result from a fetch is ambiguous (no transitives // OR a fetch/parse failure). 
Track conservatively as @@ -368,12 +442,35 @@ pub async fn mb_fetch( } } - let total_wall = total_start.elapsed().as_millis(); + let total_wall_ms = total_start.elapsed().as_millis(); + let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); + let eff_par_full = if total_wall_us > 0 { + sum_wall_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let eff_par_net = if total_wall_us > 0 { + sum_net_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let avg_wall_us = sum_wall_us + .checked_div(fetch_count + settle_count) + .unwrap_or(0); + let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); tracing::info!( - "p1-breakdown mb_fetch wall={}ms ok={} fail={}", - total_wall, + "p1-breakdown mb_fetch wall={}ms ok={} fail={} fetch={} settle={} sum_wall={}ms sum_net={}ms avg_wall={}us avg_net={}us eff_par_full={:.1} eff_par_net={:.1}", + total_wall_ms, stats.success, stats.fail, + fetch_count, + settle_count, + sum_wall_us / 1000, + sum_net_us / 1000, + avg_wall_us, + avg_net_us, + eff_par_full, + eff_par_net, ); stats From c02bb15280e5132fc96495a3cf1d3206c423ec31 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 15:19:38 +0800 Subject: [PATCH 25/32] perf(pm): fold preload + BFS into mb_fetch_with_graph for utoo deps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 3 of staged service-layer ablation. Targets the 305 ms BFS phase observed against a fully-warm MemoryCache — 100 % attributed to process_dependency.await sum (graph mutations) per d9fb2070's new bfs instrumentation. Adds: * `process_dependency_with_resolved` in builder.rs — sync variant of process_dependency for the registry-resolved case. Skips spec-routing (only Registry handled), skips resolve_registry_dep (resolved is the parameter), skips override re-resolve. Reuses existing helpers (find_compatible_node, create_package_node, add_edges_from, mark_dependency_resolved, update_node_type_from_edge). 
* `mb_fetch_with_graph` in mb_resolve.rs — folded streaming preload + graph build. Each fetch result triggers inline process_dependency_with_resolved for every parent edge waiting on (name, spec). New nodes' edges feed back into pending / edge_targets, so the walk continues streaming-style. CPU work (graph mutations, ~305 ms total) overlaps with network IO (mb_fetch's wall ~2.4 s). Wires `service::api::build_deps` to use mb_fetch_with_graph for the lockfile-only path (skip_preload + cold cache). The follow-up build_deps_with_config still runs to handle any non-registry edges left unresolved (workspace / git / http / file); on registry-only workloads it's near no-op. Install path unchanged — pipeline_deps_options keeps preload + PackageResolved early-start signal for tgz download. Expected: utoo p1 wall drops from ~2.76 s toward mb_fetch wall + serialize ≈ 2.4-2.5 s on good network. Tracing line: p1-breakdown mb_fetch_with_graph wall=Xms ok=N fetch=N settle=N sum_wall=Xms sum_net=Xms sum_graph=Xms avg_net=Xus eff_par_full=N.N eff_par_net=N.N unresolved_targets=N Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/builder.rs | 49 ++++ crates/ruborist/src/resolver/mb_resolve.rs | 297 +++++++++++++++++++++ crates/ruborist/src/service/api.rs | 36 +-- 3 files changed, 367 insertions(+), 15 deletions(-) diff --git a/crates/ruborist/src/resolver/builder.rs b/crates/ruborist/src/resolver/builder.rs index a5d3e12a4..97db89e79 100644 --- a/crates/ruborist/src/resolver/builder.rs +++ b/crates/ruborist/src/resolver/builder.rs @@ -651,6 +651,55 @@ pub async fn process_dependency( } } +/// Sync variant of [`process_dependency`] for callers that already +/// have a resolved registry manifest in hand (the +/// `mb_fetch_with_graph` lockfile-only path populates one +/// per fetch). Skips: +/// * spec-routing (`Git` / `Http` / `Local` / `Workspace`) — only +/// the `Registry` branch is handled. 
Non-registry edges are +/// left unresolved for the caller to defer. +/// * `resolve_registry_dep` (the resolved package is the +/// parameter). +/// * Override re-resolve (uses the original resolved package even +/// if `graph.check_override` would re-route the spec). Override +/// re-resolve requires another network round-trip; the +/// lockfile-only fast path skips it intentionally — overridden +/// specs that diverge from the original resolution will need a +/// follow-up BFS sweep. +/// +/// Returns the same [`ProcessResult`] shape as `process_dependency` +/// so the caller can register newly-created nodes' edges with +/// `edge_targets` for the streaming graph build. +pub fn process_dependency_with_resolved( + graph: &mut DependencyGraph, + parent_idx: NodeIndex, + edge_info: &DependencyEdgeInfo, + resolved: &ResolvedPackage, + config: &BuildDepsConfig, +) -> ProcessResult { + match graph.find_compatible_node(parent_idx, &edge_info.name, &edge_info.spec) { + FindResult::Reuse(existing_index) => { + graph.mark_dependency_resolved(edge_info.edge_id, existing_index); + update_node_type_from_edge(graph, parent_idx, existing_index, &edge_info.edge_type); + ProcessResult::Reused(existing_index) + } + FindResult::Conflict(conflict_parent) | FindResult::New(conflict_parent) => { + let new_node = create_package_node(&edge_info.name, resolved, conflict_parent, graph); + let new_index = graph.add_node(new_node); + graph.add_physical_edge(conflict_parent, new_index); + graph.mark_dependency_resolved(edge_info.edge_id, new_index); + update_node_type_from_edge(graph, parent_idx, new_index, &edge_info.edge_type); + add_edges_from( + graph, + new_index, + &*resolved.manifest, + &EdgeContext::new(config.peer_deps, DevDeps::Exclude), + ); + ProcessResult::Created(new_index) + } + } +} + /// Build the complete dependency tree using BFS traversal. /// /// This is the main entry point for dependency resolution. 
It starts from diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index a4b2ba8c1..4e2f8cc85 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -37,14 +37,20 @@ use std::time::Instant; use anyhow::{Context, Result, anyhow}; use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; +use petgraph::graph::{EdgeIndex, NodeIndex}; use serde::Deserialize; +use crate::model::graph::DependencyGraph; use crate::model::manifest::{CoreVersionManifest, FullManifest}; use crate::model::node::PeerDeps; +use crate::resolver::builder::{ + BuildDepsConfig, ProcessResult, collect_unresolved_edges, process_dependency_with_resolved, +}; use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; +use crate::traits::registry::ResolvedPackage; #[derive(Debug, Default)] pub struct MbFetchStats { @@ -475,3 +481,294 @@ pub async fn mb_fetch( stats } + +// ============================================================================ +// Folded streaming graph build — preload + BFS in one phase +// ============================================================================ + +/// Edges waiting on a `(name, spec)` fetch. Multiple parents can need +/// the same registry dep; we track them all and process inline as +/// soon as the manifest lands. +type EdgeTargets = HashMap<(String, String), Vec<(NodeIndex, EdgeIndex)>>; + +/// Collect the unresolved registry edges from `node_idx` into +/// pending + edge_targets, dedup by spec via `seen_specs`. +/// Non-registry edges (workspace / git / http / file) are +/// deliberately left for the follow-up BFS sweep. 
+fn enqueue_node_edges(
+    graph: &DependencyGraph,
+    node_idx: NodeIndex,
+    pending: &mut VecDeque<(String, String)>,
+    seen_specs: &mut HashSet<(String, String)>,
+    edge_targets: &mut EdgeTargets,
+) {
+    for edge in collect_unresolved_edges(graph, node_idx) {
+        if !edge.spec.is_registry_spec() {
+            continue;
+        }
+        let key = (edge.name.clone(), edge.spec.clone());
+        edge_targets
+            .entry(key.clone())
+            .or_default()
+            .push((node_idx, edge.edge_id));
+        if seen_specs.insert(key.clone()) {
+            pending.push_back(key);
+        }
+    }
+}
+
+/// Folded variant: combines `mb_fetch`'s streaming preload with the
+/// graph mutations that BFS would otherwise do in a separate phase.
+/// Each fetch result triggers inline `process_dependency_with_resolved`
+/// for every parent edge waiting on `(name, spec)`. New nodes' edges
+/// feed back into pending / edge_targets, so the walk continues
+/// streaming-style without a separate level-by-level traversal.
+///
+/// CPU work (graph mutations) overlaps with network IO (more fetches
+/// in flight via `FuturesUnordered`), so the 305 ms BFS phase
+/// observed against a fully-warm cache is collapsed into mb_fetch's
+/// wall instead of running serially after it.
+///
+/// Non-registry edges (workspace / git / http / file) and any edges
+/// added after the streaming loop converges (override re-resolves
+/// that diverge from the original spec) are left unresolved — the
+/// caller must run a follow-up BFS sweep to handle them. For
+/// `utoo deps` on registry-only workloads (the common case), the
+/// sweep is a no-op.
+pub async fn mb_fetch_with_graph( + graph: &mut DependencyGraph, + registry_url: &str, + cache: &MemoryCache, + preload_config: &PreloadConfig, + build_config: &BuildDepsConfig, +) -> Result { + let mut stats = MbFetchStats::default(); + let total_start = Instant::now(); + + let client = match build_mb_client() { + Ok(c) => c, + Err(e) => { + tracing::warn!("mb_resolve client build failed: {e}"); + return Ok(stats); + } + }; + let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); + let cap = preload_config.concurrency; + let peer_deps = preload_config.peer_deps; + + // Initial seed: walk root + workspace nodes for unresolved + // registry edges. (Workspace nodes were created during graph + // initialization in `service::api::build_deps`.) + let mut seen_specs: HashSet<(String, String)> = HashSet::new(); + let mut pending: VecDeque = VecDeque::new(); + let mut edge_targets: EdgeTargets = HashMap::new(); + + let root_index = graph.root_index; + enqueue_node_edges( + graph, + root_index, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + // Workspace nodes' direct edges. Workspace deps may be + // workspace: (resolved at graph init) or registry; registry + // ones land in pending. + for node_idx in graph.graph.node_indices() { + if let Some(node) = graph.get_node(node_idx) + && node.is_workspace() + { + enqueue_node_edges( + graph, + node_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + } + } + + // Sibling-fetch dedup carries over from `mb_fetch`. + let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); + let mut in_flight_names: HashSet = HashSet::new(); + let mut deferred_by_name: HashMap> = HashMap::new(); + + let mut futs: FuturesUnordered = FuturesUnordered::new(); + + // Per-fetch-future timing accumulators (same as `mb_fetch`). 
+ let mut sum_wall_us: u64 = 0; + let mut sum_net_us: u64 = 0; + let mut fetch_count: u64 = 0; + let mut settle_count: u64 = 0; + // Sum of CPU spent in inline graph mutations across all fetched + // events. Reported alongside the fetch totals so we can attribute + // the mb_fetch wall split between IO and CPU. + let mut sum_graph_us: u64 = 0; + + loop { + while futs.len() < cap { + let Some((name, spec)) = pending.pop_front() else { + break; + }; + if let Some(raw) = body_cache.lock().get(&name).cloned() { + futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); + continue; + } + if !in_flight_names.insert(name.clone()) { + deferred_by_name.entry(name).or_default().push(spec); + continue; + } + futs.push(spawn_fetch( + client.clone(), + Arc::clone(®istry), + name, + spec, + cache.clone(), + Arc::clone(&body_cache), + peer_deps, + )); + } + + if futs.is_empty() { + break; + } + + let Some(out) = futs.next().await else { break }; + + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + } else { + settle_count += 1; + } + if out.fetched { + stats.success += 1; + } + + // Drain sibling specs deferred while the fetch was in flight. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } + + // Graph mutations: process every parent edge waiting on + // `(name, spec)` for each transitive spec the fetch resolved + // (the fetch itself touched only the primary spec; sibling + // settles touch their own specs). Each settle path covers + // its own bucket via the `out.transitives` path below. + // + // The fetched/settled (name, spec) pair has already been + // written to the cache by the future. 
Look up the version + // manifest to get the ResolvedPackage handed to + // process_dependency_with_resolved. + let graph_start = Instant::now(); + let process_key_specs: Vec<(String, String)> = out + .transitives + .iter() + .map(|(n, s)| (n.clone(), s.clone())) + .collect(); + // The primary fetched/settled spec itself: resolve it now. + let primary_keys: Vec<(String, String)> = edge_targets + .keys() + .filter(|(n, _)| n == &out.name) + .cloned() + .collect(); + for (k_name, k_spec) in primary_keys { + // Pull resolved manifest from cache for this spec. + let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { + continue; + }; + let resolved = ResolvedPackage { + name: k_name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let waiting = edge_targets.remove(&(k_name.clone(), k_spec.clone())); + if let Some(targets) = waiting { + for (parent_idx, edge_id) in targets { + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id, + name: k_name.clone(), + spec: k_spec.clone(), + // edge_type carried separately on the graph; we + // re-look-up the actual edge here for + // correctness. + edge_type: graph + .graph + .edge_weight(edge_id) + .and_then(|e| match e { + crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), + _ => None, + }) + .unwrap_or(crate::model::node::EdgeType::Prod), + }; + let result = process_dependency_with_resolved( + graph, + parent_idx, + &edge_info, + &resolved, + build_config, + ); + if let ProcessResult::Created(new_idx) = result { + // The new node's transitive edges become new + // pending entries. Same dedup as the seed. 
+ enqueue_node_edges( + graph, + new_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + ); + } + } + } + } + sum_graph_us += graph_start.elapsed().as_micros() as u64; + // Suppress an unused-vars warning when the transitive list is + // identical to the keys we just pulled from edge_targets — + // we keep collecting it for tracing parity with `mb_fetch`. + let _ = process_key_specs; + } + + let total_wall_ms = total_start.elapsed().as_millis(); + let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); + let eff_par_full = if total_wall_us > 0 { + sum_wall_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let eff_par_net = if total_wall_us > 0 { + sum_net_us as f64 / total_wall_us as f64 + } else { + 0.0 + }; + let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); + let unresolved_remaining = edge_targets.len(); + tracing::info!( + "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={}", + total_wall_ms, + stats.success, + fetch_count, + settle_count, + sum_wall_us / 1000, + sum_net_us / 1000, + sum_graph_us / 1000, + avg_net_us, + eff_par_full, + eff_par_net, + unresolved_remaining, + ); + + Ok(stats) +} diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 06079b248..837ebfc5b 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -36,9 +36,8 @@ use crate::model::package_lock::PackageLock; use crate::model::util::parse_package_spec; use crate::resolver::builder::{ BuildDepsConfig, DevDeps, EdgeContext, PeerDeps, add_edges_from, build_deps_with_config, - gather_preload_deps, }; -use crate::resolver::mb_resolve::mb_fetch; +use crate::resolver::mb_resolve::mb_fetch_with_graph; use crate::resolver::preload::PreloadConfig; use crate::resolver::runtime::install_runtime_from_map; use crate::resolver::workspace::WorkspaceDiscovery; 
@@ -272,32 +271,39 @@ where ); } - // Lockfile-only callers (`utoo deps`) skip the receiver-driven - // `run_preload_phase` because they have no pipeline consumer for - // `BuildEvent::PackageResolved`. Route through `mb_fetch` — a - // ruborist-internal standalone preload that bypasses - // `service::http`, `service::manifest`, and `service::registry` - // to match `manifest-bench`'s loop shape directly. PM is - // unaware: this dispatch happens entirely inside ruborist when - // `skip_preload=true` and there's no warm project cache. - if skip_preload_caller && cache_count == 0 { - let initial_deps = gather_preload_deps(&graph, peer_deps); + // Lockfile-only callers (`utoo deps`) route through + // `mb_fetch_with_graph` — a folded streaming preload + graph + // build. The fetch loop drives manifest IO; per-result inline + // `process_dependency_with_resolved` mutates the graph. Result: + // no separate BFS phase. The follow-up + // `build_deps_with_config` call still runs to handle any + // non-registry edges (workspace / git / http / file) the fold + // path skipped, but on registry-only workloads it's near no-op. + let folded = skip_preload_caller && cache_count == 0; + if folded { let preload_config = PreloadConfig { peer_deps, concurrency, }; - mb_fetch( - initial_deps, + mb_fetch_with_graph( + &mut graph, registry.registry_url(), registry.cache(), &preload_config, + &config, ) - .await; + .await + .map_err(|e| e.context("mb_fetch_with_graph failed"))?; } // Preserve the typed error via `Error::new` + `.context(...)` so CLI // renderers (e.g. pm's format_print) can downcast and pretty-print the // dependency chain carried by `ResolveError::WithChain`. + // + // For the folded path this BFS sweeps remaining unresolved edges + // (non-registry: workspace / git / http / file). On + // registry-only workloads (the common case) the graph is fully + // built already, BFS walks nothing. 
build_deps_with_config(&mut graph, ®istry, config, &receiver) .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; From 63928a73b09d42953a8a41eba9d95deb0c4a47ad Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 15:44:43 +0800 Subject: [PATCH 26/32] =?UTF-8?q?fix(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20drain=20edge=5Ftargets=20via=20inline=20cache=20hit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit c02bb152 had unresolved_targets=583 in trace — `enqueue_node_edges` was unconditionally pushing (parent, edge_id) into edge_targets without checking if the (name, spec) was already cached. When a later transitive's edge referenced an already-fetched (name, spec), no fetch result would land to drain that bucket — the parent edges sat unresolved, potentially missing packages from the lockfile. Fix: enqueue_node_edges now checks cache.get_version_manifest first. Cache hit → process_dependency_with_resolved inline (with a work_stack to recurse into newly-Created nodes' edges). Cache miss → original behavior (stash in edge_targets, push to pending). Side effect: more inline graph mutation work in the seed phase (workspace + root edges that hit warm cache from previous specs in the same root). Should reduce the number of fetch-result events that need to do graph mutations downstream, since orphan edges no longer accumulate. Targets the correctness bug from c02bb152 trace; perf impact TBD. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 109 ++++++++++++++++----- 1 file changed, 84 insertions(+), 25 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 4e2f8cc85..4252e7efd 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -495,24 +495,73 @@ type EdgeTargets = HashMap<(String, String), Vec<(NodeIndex, EdgeIndex)>>; /// pending + edge_targets, dedup by spec via `seen_specs`. /// Non-registry edges (workspace / git / http / file) are /// deliberately left for the follow-up BFS sweep. +/// Process this node's unresolved registry edges: +/// * If the (name, spec) is already cached (a sibling subtree +/// resolved it earlier), call `process_dependency_with_resolved` +/// inline now. Newly-created child nodes recurse via this same +/// function so their edges are also enqueued/processed. +/// * Otherwise, register the (parent, edge_id) under `edge_targets` +/// so the eventual fetch result drains it; push to `pending` if +/// this `(name, spec)` hasn't been seen. +/// +/// Without the inline-process path, `(name, spec)` keys added +/// AFTER their fetch already landed would never be drained — they'd +/// sit in `edge_targets` and the corresponding parent edges would +/// stay unresolved. CI run c02bb152 showed ~580 such orphans. 
fn enqueue_node_edges( - graph: &DependencyGraph, + graph: &mut DependencyGraph, node_idx: NodeIndex, pending: &mut VecDeque, seen_specs: &mut HashSet<(String, String)>, edge_targets: &mut EdgeTargets, + cache: &MemoryCache, + build_config: &BuildDepsConfig, ) { - for edge in collect_unresolved_edges(graph, node_idx) { - if !edge.spec.is_registry_spec() { - continue; - } - let key = (edge.name.clone(), edge.spec.clone()); - edge_targets - .entry(key.clone()) - .or_default() - .push((node_idx, edge.edge_id)); - if seen_specs.insert(key.clone()) { - pending.push_back(key); + let mut work_stack: Vec = vec![node_idx]; + while let Some(idx) = work_stack.pop() { + let edges = collect_unresolved_edges(graph, idx); + for edge in edges { + if !edge.spec.is_registry_spec() { + continue; + } + let key = (edge.name.clone(), edge.spec.clone()); + + // Cache-hit fast path: process immediately, no + // edge_targets stash. Reuses the same process logic the + // main loop uses on fetch result. + if let Some(core_arc) = cache.get_version_manifest(&edge.name, &edge.spec) { + let resolved = ResolvedPackage { + name: edge.name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id: edge.edge_id, + name: edge.name.clone(), + spec: edge.spec.clone(), + edge_type: edge.edge_type, + }; + if let ProcessResult::Created(new_idx) = process_dependency_with_resolved( + graph, + idx, + &edge_info, + &resolved, + build_config, + ) { + work_stack.push(new_idx); + } + // Whether Created or Reused, this edge is now + // resolved — don't queue. + continue; + } + + edge_targets + .entry(key.clone()) + .or_default() + .push((idx, edge.edge_id)); + if seen_specs.insert(key.clone()) { + pending.push_back(key); + } } } } @@ -570,22 +619,27 @@ pub async fn mb_fetch_with_graph( &mut pending, &mut seen_specs, &mut edge_targets, + cache, + build_config, ); // Workspace nodes' direct edges. 
Workspace deps may be // workspace: (resolved at graph init) or registry; registry // ones land in pending. - for node_idx in graph.graph.node_indices() { - if let Some(node) = graph.get_node(node_idx) - && node.is_workspace() - { - enqueue_node_edges( - graph, - node_idx, - &mut pending, - &mut seen_specs, - &mut edge_targets, - ); - } + let workspace_indices: Vec = graph + .graph + .node_indices() + .filter(|&i| graph.get_node(i).is_some_and(|n| n.is_workspace())) + .collect(); + for node_idx in workspace_indices { + enqueue_node_edges( + graph, + node_idx, + &mut pending, + &mut seen_specs, + &mut edge_targets, + cache, + build_config, + ); } // Sibling-fetch dedup carries over from `mb_fetch`. @@ -722,13 +776,18 @@ pub async fn mb_fetch_with_graph( ); if let ProcessResult::Created(new_idx) = result { // The new node's transitive edges become new - // pending entries. Same dedup as the seed. + // pending entries. enqueue handles cache-hit + // inline-process so we don't orphan + // edge_targets entries after their fetch + // already landed. enqueue_node_edges( graph, new_idx, &mut pending, &mut seen_specs, &mut edge_targets, + cache, + build_config, ); } } From 2527137b4bbb7a7e2622abeefedf3d03e1070c2a Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:09:41 +0800 Subject: [PATCH 27/32] ci(pcap): add manifest-bench + preload-bench captures for TCP-level diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 700ms gap between utoo p1 (folded mb_fetch_with_graph) and manifest-bench standalone needs network-layer evidence. Same workload, same conc, same network → why does utoo wall trail by 700ms when per-fetch latency is matched (avg_net=53us = mb p50=40us ish)? 
Hypotheses to test via pcap diff: * Fewer concurrent TCP streams in flight at any moment (utoo's main loop CPU steals tokio dispatch capacity → in-flight count drops below conc cap) * More TLS handshakes (utoo's connection pool isn't reusing as effectively as mb's per-rep fresh client) * Larger inter-packet gaps per stream (utoo's runtime pauses mid download) * Different concurrent-stream-time profile (wave shape) Adds two captures at end of pm-bench-pcap.sh: manifest-bench-c96 — flat lockfile-derived names @ conc=96 preload-bench-c96 — transitive walk @ conc=96 (matches utoo's walk shape, but no graph build) Each captured with the same tcpdump + iostat as the existing utoo / utoo-next / bun captures. analyze_pcap globs *.pcap so the new files get the same TCP signal extraction (zwin / retx / dup_ack / per-stream gap p50/p99/max / distinct streams). Workflow: downloads manifest-bench-linux-x64 + preload-bench-linux-x64 artifacts (built by build-linux's benchmark-label conditional steps) into the pm-bench-pcap-linux job env so pm-bench-pcap.sh can find them. Trigger: workflow_dispatch with target=pm-bench-pcap. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 23 ++++++++++++++++++++ bench/pm-bench-pcap.sh | 34 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index eb560969b..13b463a22 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1000,6 +1000,29 @@ jobs: mv /tmp/utoo-next-dist/utoo /tmp/utoo-next echo "Baseline utoo (next) version: $(/tmp/utoo-next --version)" echo "UTOO_NEXT_BIN=/tmp/utoo-next" >> $GITHUB_ENV + # manifest-bench + preload-bench binaries for pcap-comparing + # utoo's TCP-level behaviour against pure-HTTP and + # transitive-walk baselines. 
+ - name: Download manifest-bench binary + uses: actions/download-artifact@v4 + with: + name: manifest-bench-linux-x64 + path: /tmp/manifest-bench-dist + - name: Install manifest-bench + run: | + chmod +x /tmp/manifest-bench-dist/manifest-bench + mv /tmp/manifest-bench-dist/manifest-bench /tmp/manifest-bench + echo "MANIFEST_BENCH_BIN=/tmp/manifest-bench" >> $GITHUB_ENV + - name: Download preload-bench binary + uses: actions/download-artifact@v4 + with: + name: preload-bench-linux-x64 + path: /tmp/preload-bench-dist + - name: Install preload-bench + run: | + chmod +x /tmp/preload-bench-dist/preload-bench + mv /tmp/preload-bench-dist/preload-bench /tmp/preload-bench + echo "PRELOAD_BENCH_BIN=/tmp/preload-bench" >> $GITHUB_ENV - name: Capture pcap env: PROJECT: ${{ github.event.inputs.project || 'ant-design' }} diff --git a/bench/pm-bench-pcap.sh b/bench/pm-bench-pcap.sh index 7a0f7c819..d7f71e106 100755 --- a/bench/pm-bench-pcap.sh +++ b/bench/pm-bench-pcap.sh @@ -139,6 +139,40 @@ fi run_pm_phases bun "$(command -v bun)" "$BUN_CACHE" +# --- standalone bench captures (resolve-only baselines) ---------------- +# After all PM captures, regenerate a fresh package-lock.json via utoo +# deps (untimed) so manifest-bench has a stable name list to consume. +# Then pcap-capture each standalone bench at conc=96 — the same conc +# utoo's mb_fetch_with_graph ran with — so the TCP signals are +# directly comparable between the integrated path and the pure-HTTP +# / pure-streaming-walk ceilings. 
+cd "$PROJECT_DIR" +rm -f package-lock.json bun.lock +rm -rf "$UTOO_CACHE" node_modules +echo "=== regenerating package-lock.json for standalone benches ===" +utoo deps --registry="$REGISTRY" --cache-dir="$UTOO_CACHE" \ + >/dev/null 2>&1 || echo "lock regen failed" + +if [ -f package-lock.json ] && [ -n "${MANIFEST_BENCH_BIN:-}" ] && [ -x "$MANIFEST_BENCH_BIN" ]; then + capture_one "manifest-bench-c96" \ + "$MANIFEST_BENCH_BIN" \ + --lockfile package-lock.json \ + --registry "$REGISTRY" \ + --concurrency 96 --reps 1 --http1-only +else + echo "skip manifest-bench: bin missing or no lockfile" +fi + +if [ -n "${PRELOAD_BENCH_BIN:-}" ] && [ -x "$PRELOAD_BENCH_BIN" ]; then + capture_one "preload-bench-c96" \ + "$PRELOAD_BENCH_BIN" \ + --package-json package.json \ + --registry "$REGISTRY" \ + --concurrency 96 --reps 1 +else + echo "skip preload-bench: bin missing" +fi + # --- post-capture analysis: tshark metrics per pcap --------------------- # Extract TCP-level stress signals to validate the "install greediness # starves download" hypothesis. All of these are pre-TLS so we don't need From fe26709ebf7df12a70c65b06c86f0d20266cdf69 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:45:05 +0800 Subject: [PATCH 28/32] ci(pcap): upload small summaries artifact alongside the 2GB pcap dump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous pm-bench-pcap artifact was 2GB (raw .pcap files for every PM × phase × bench), making the round-trip download impractical just to read JSON metrics. Adds a separate `pm-bench-pcap-summaries` artifact containing only the *.json / *.log / *.iostat.txt / dns.txt files — KB scale, downloads in seconds. Raw pcap artifact is preserved for cases where we want to re-run tshark with different filters. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 13b463a22..1970a2cd5 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1030,6 +1030,20 @@ jobs: run: | chmod +x bench/pm-bench-pcap.sh bash bench/pm-bench-pcap.sh + # Small artifact (KB scale) with just the per-capture + + # aggregated metrics — fast to download for diff analysis, + # avoids the 2GB pcap-corpus pull when we only need numbers. + - name: Upload pcap summaries + if: always() + uses: actions/upload-artifact@v4 + with: + name: pm-bench-pcap-summaries + path: | + /tmp/pm-bench-pcap/*.json + /tmp/pm-bench-pcap/*.log + /tmp/pm-bench-pcap/*.iostat.txt + /tmp/pm-bench-pcap/dns.txt + retention-days: 7 - name: Upload pcap artifact if: always() uses: actions/upload-artifact@v4 From 1ff58aec7f33282103361249d9f101ec1b394b8e Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 16:52:02 +0800 Subject: [PATCH 29/32] ci(pcap): upload summary-only artifact + print table to CI logs The pm-bench-pcap artifact is ~2 GB (pcap binaries dominate). gh run download keeps timing out before completion. Two fixes: 1. New `pm-bench-pcap-summaries` artifact uploads only the JSON summaries + .log + iostat.txt + dns.txt (small, fast download). The full pcap artifact stays for deep inspection when needed. 2. End of pm-bench-pcap.sh prints a tab-separated comparison table (name, wall_s, packets, streams, zwin, retx, dup_ack, gap_p99_us, gap_max_us) to stdout, so the data is visible in the CI run log without downloading anything. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pm-e2e-bench.yml | 15 +++++++++++++++ bench/pm-bench-pcap.sh | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/.github/workflows/pm-e2e-bench.yml b/.github/workflows/pm-e2e-bench.yml index 1970a2cd5..5b219199a 100644 --- a/.github/workflows/pm-e2e-bench.yml +++ b/.github/workflows/pm-e2e-bench.yml @@ -1051,3 +1051,18 @@ jobs: name: pm-bench-pcap path: /tmp/pm-bench-pcap retention-days: 7 + # Tiny summary-only artifact for quick comparison without + # re-downloading the multi-GB pcap blob. Includes the + # tshark-extracted JSON metrics + the pcap.log files (text, + # tiny) but no .pcap binaries. + - name: Upload pcap summaries (small) + if: always() + uses: actions/upload-artifact@v4 + with: + name: pm-bench-pcap-summaries + path: | + /tmp/pm-bench-pcap/*.json + /tmp/pm-bench-pcap/*.log + /tmp/pm-bench-pcap/*.iostat.txt + /tmp/pm-bench-pcap/dns.txt + retention-days: 7 diff --git a/bench/pm-bench-pcap.sh b/bench/pm-bench-pcap.sh index d7f71e106..7f37fc5db 100755 --- a/bench/pm-bench-pcap.sh +++ b/bench/pm-bench-pcap.sh @@ -367,3 +367,25 @@ fi echo "done. files:" ls -lh "$PCAP_DIR" + +# Print summary table to CI logs so we don't need to download the +# 2 GB pcap artifact just to read the comparison numbers. 
+echo +echo "=== summary table ===" +if command -v jq >/dev/null && [ -f "$PCAP_DIR/summary.json" ]; then + jq -r ' + .captures + | (["name", "wall_s", "packets", "streams", "zwin", "retx", "dup_ack", "gap_p99_us", "gap_max_us"] | @tsv), + (.[] | [ + .name, + (.wall_seconds | tostring), + (.packet_count | tostring), + (.distinct_streams | tostring), + (.zero_windows | tostring), + (.retransmits | tostring), + (.duplicate_acks | tostring), + (.gap_p99_us | tostring), + (.gap_max_us | tostring) + ] | @tsv) + ' "$PCAP_DIR/summary.json" | column -t +fi From a21f24bca1f6d0172b36a1b386ac3aed2ef61165 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 23:14:46 +0800 Subject: [PATCH 30/32] =?UTF-8?q?perf(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20channel-based=20separation=20of=20fetch=20+=20graph?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pcap evidence (utoo-resolve zwin=71 vs mb-c96 zwin=49) confirmed main loop CPU was starving tokio runtime workers from polling sockets. Inline graph mutations (sum_graph=450ms across the fetch loop) blocked the worker between awaits, so TCP receive buffers filled and the server paused sending — directly extending wall. This refactor: * Spawns `graph_worker` as a separate tokio task (gets its own runtime worker thread on multi-thread runtime). Owns the DependencyGraph + edge_targets + seen_specs. * Main loop owns FuturesUnordered + body_cache + dispatch state. No graph mutations on this path. * mpsc channels: main → graph (FetchEventMsg, just the name — cache writes already in the future), graph → main (Vec new pending specs to extend the fetch queue). * `tokio::select!` with `biased` drains specs first to unblock fetch dispatch. * `in_flight_graph` counter tracks outstanding messages to graph worker — termination = futs empty + in_flight_graph == 0. 
Function signature changed: takes `mut graph: DependencyGraph` by value, returns `(DependencyGraph, MbFetchStats)` since the worker task needs ownership of the graph (can't borrow across spawn). api.rs caller threads the graph through. Expected: zwin drops back toward mb's ~49 (no more main loop starvation), eff_par_net climbs from 56 toward mb's 72, wall saves ~200ms. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 402 ++++++++++++++------- crates/ruborist/src/service/api.rs | 5 +- 2 files changed, 277 insertions(+), 130 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 4252e7efd..197fcbc26 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -39,6 +39,7 @@ use futures::stream::{FuturesUnordered, StreamExt}; use parking_lot::Mutex; use petgraph::graph::{EdgeIndex, NodeIndex}; use serde::Deserialize; +use tokio::sync::mpsc; use crate::model::graph::DependencyGraph; use crate::model::manifest::{CoreVersionManifest, FullManifest}; @@ -584,13 +585,21 @@ fn enqueue_node_edges( /// caller must run a follow-up BFS sweep to handle them. For /// `utoo deps` on registry-only workloads (the common case), the /// sweep is a no-op. +/// One fetched/settled event, sent from main loop to graph worker. +/// The future already performed cache writes inline (cheap DashMap +/// inserts). Graph worker uses `cache.get_version_manifest` to +/// retrieve the manifest for `process_dependency_with_resolved`. 
+struct FetchEventMsg { + name: String, +} + pub async fn mb_fetch_with_graph( - graph: &mut DependencyGraph, + mut graph: DependencyGraph, registry_url: &str, cache: &MemoryCache, preload_config: &PreloadConfig, build_config: &BuildDepsConfig, -) -> Result { +) -> Result<(DependencyGraph, MbFetchStats)> { let mut stats = MbFetchStats::default(); let total_start = Instant::now(); @@ -598,7 +607,7 @@ pub async fn mb_fetch_with_graph( Ok(c) => c, Err(e) => { tracing::warn!("mb_resolve client build failed: {e}"); - return Ok(stats); + return Ok((graph, stats)); } }; let registry = Arc::new(registry_url.trim_end_matches('/').to_string()); @@ -606,15 +615,15 @@ pub async fn mb_fetch_with_graph( let peer_deps = preload_config.peer_deps; // Initial seed: walk root + workspace nodes for unresolved - // registry edges. (Workspace nodes were created during graph - // initialization in `service::api::build_deps`.) + // registry edges. Done inline before spawning workers (one-time + // cost, not on the hot path). let mut seen_specs: HashSet<(String, String)> = HashSet::new(); let mut pending: VecDeque = VecDeque::new(); let mut edge_targets: EdgeTargets = HashMap::new(); let root_index = graph.root_index; enqueue_node_edges( - graph, + &mut graph, root_index, &mut pending, &mut seen_specs, @@ -622,9 +631,6 @@ pub async fn mb_fetch_with_graph( cache, build_config, ); - // Workspace nodes' direct edges. Workspace deps may be - // workspace: (resolved at graph init) or registry; registry - // ones land in pending. let workspace_indices: Vec = graph .graph .node_indices() @@ -632,7 +638,7 @@ pub async fn mb_fetch_with_graph( .collect(); for node_idx in workspace_indices { enqueue_node_edges( - graph, + &mut graph, node_idx, &mut pending, &mut seen_specs, @@ -642,24 +648,48 @@ pub async fn mb_fetch_with_graph( ); } - // Sibling-fetch dedup carries over from `mb_fetch`. + // Channels: main → graph (fetched events) + graph → main (new + // pending specs). 
Bounded at 2 * cap so neither side stalls + // waiting for the other under bursty wave behavior. + let (fetch_tx, fetch_rx) = mpsc::channel::(cap * 2 + 16); + let (specs_tx, mut specs_rx) = mpsc::channel::>(cap * 2 + 16); + + // Spawn graph worker: owns the graph + edge_targets + seen_specs. + // This task is CPU-only (no awaits except channel IO), so on a + // multi-thread tokio runtime it gets its own worker thread, + // freeing the main task's worker to drive socket polling. That + // separation is the whole point of this rewrite — the inline + // version observed zwin events 71 vs mb's 49, evidence of main + // loop CPU starving the runtime's IO polling. + let cache_clone = cache.clone(); + let build_config_owned = build_config.clone(); + let graph_handle = tokio::spawn(graph_worker( + graph, + edge_targets, + seen_specs, + cache_clone, + build_config_owned, + fetch_rx, + specs_tx, + )); + + // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); let mut in_flight_names: HashSet = HashSet::new(); let mut deferred_by_name: HashMap> = HashMap::new(); - let mut futs: FuturesUnordered = FuturesUnordered::new(); - // Per-fetch-future timing accumulators (same as `mb_fetch`). let mut sum_wall_us: u64 = 0; let mut sum_net_us: u64 = 0; let mut fetch_count: u64 = 0; let mut settle_count: u64 = 0; - // Sum of CPU spent in inline graph mutations across all fetched - // events. Reported alongside the fetch totals so we can attribute - // the mb_fetch wall split between IO and CPU. - let mut sum_graph_us: u64 = 0; + // Number of FetchEventMsg sent to graph worker that haven't yet + // had a corresponding Vec response. Drives termination: + // when futs empty + in_flight == 0, no more work pipelined. + let mut in_flight_graph: usize = 0; loop { + // Refill futs from pending up to cap. 
while futs.len() < cap { let Some((name, spec)) = pending.pop_front() else { break; @@ -683,123 +713,74 @@ pub async fn mb_fetch_with_graph( )); } - if futs.is_empty() { + // Termination: nothing in flight at fetch level AND graph + // worker has nothing pending. + if futs.is_empty() && in_flight_graph == 0 { break; } - let Some(out) = futs.next().await else { break }; - - sum_wall_us += out.wall_us; - sum_net_us += out.net_us; - if out.fetched { - fetch_count += 1; - } else { - settle_count += 1; - } - if out.fetched { - stats.success += 1; - } - - // Drain sibling specs deferred while the fetch was in flight. - if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() - { - for sibling_spec in siblings { - futs.push(spawn_settle( - out.name.clone(), - sibling_spec, - Arc::clone(&raw), - cache.clone(), - peer_deps, - )); + // Drive both halves: prefer draining specs back from graph + // worker (unblocks new fetch dispatch) over starting another + // fetch landing. + tokio::select! { + biased; + maybe_specs = specs_rx.recv() => { + match maybe_specs { + Some(specs) => { + pending.extend(specs); + in_flight_graph -= 1; + } + None => { + // Graph worker exited unexpectedly. Bail. + break; + } + } } - } + maybe_result = futs.next(), if !futs.is_empty() => { + if let Some(out) = maybe_result { + sum_wall_us += out.wall_us; + sum_net_us += out.net_us; + if out.fetched { + fetch_count += 1; + stats.success += 1; + } else { + settle_count += 1; + } - // Graph mutations: process every parent edge waiting on - // `(name, spec)` for each transitive spec the fetch resolved - // (the fetch itself touched only the primary spec; sibling - // settles touch their own specs). Each settle path covers - // its own bucket via the `out.transitives` path below. - // - // The fetched/settled (name, spec) pair has already been - // written to the cache by the future. 
Look up the version - // manifest to get the ResolvedPackage handed to - // process_dependency_with_resolved. - let graph_start = Instant::now(); - let process_key_specs: Vec<(String, String)> = out - .transitives - .iter() - .map(|(n, s)| (n.clone(), s.clone())) - .collect(); - // The primary fetched/settled spec itself: resolve it now. - let primary_keys: Vec<(String, String)> = edge_targets - .keys() - .filter(|(n, _)| n == &out.name) - .cloned() - .collect(); - for (k_name, k_spec) in primary_keys { - // Pull resolved manifest from cache for this spec. - let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { - continue; - }; - let resolved = ResolvedPackage { - name: k_name.clone(), - version: core_arc.version.clone(), - manifest: core_arc, - }; - let waiting = edge_targets.remove(&(k_name.clone(), k_spec.clone())); - if let Some(targets) = waiting { - for (parent_idx, edge_id) in targets { - let edge_info = crate::resolver::edges::DependencyEdgeInfo { - edge_id, - name: k_name.clone(), - spec: k_spec.clone(), - // edge_type carried separately on the graph; we - // re-look-up the actual edge here for - // correctness. - edge_type: graph - .graph - .edge_weight(edge_id) - .and_then(|e| match e { - crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), - _ => None, - }) - .unwrap_or(crate::model::node::EdgeType::Prod), - }; - let result = process_dependency_with_resolved( - graph, - parent_idx, - &edge_info, - &resolved, - build_config, - ); - if let ProcessResult::Created(new_idx) = result { - // The new node's transitive edges become new - // pending entries. enqueue handles cache-hit - // inline-process so we don't orphan - // edge_targets entries after their fetch - // already landed. - enqueue_node_edges( - graph, - new_idx, - &mut pending, - &mut seen_specs, - &mut edge_targets, - cache, - build_config, - ); + // Drain sibling specs deferred while the fetch + // was in flight. 
Sibling settles also produce a + // FetchEventMsg downstream. + if out.fetched + && let Some(siblings) = deferred_by_name.remove(&out.name) + && let Some(raw) = body_cache.lock().get(&out.name).cloned() + { + for sibling_spec in siblings { + futs.push(spawn_settle( + out.name.clone(), + sibling_spec, + Arc::clone(&raw), + cache.clone(), + peer_deps, + )); + } + } + + // Send to graph worker. `send().await` only + // blocks if channel is full (cap * 2 buffer); + // under steady state shouldn't happen. + if fetch_tx.send(FetchEventMsg { name: out.name }).await.is_ok() { + in_flight_graph += 1; } } } } - sum_graph_us += graph_start.elapsed().as_micros() as u64; - // Suppress an unused-vars warning when the transitive list is - // identical to the keys we just pulled from edge_targets — - // we keep collecting it for tracing parity with `mb_fetch`. - let _ = process_key_specs; } + // Signal graph worker to exit, then await its finalization to + // recover the graph + stats. + drop(fetch_tx); + let (graph, graph_stats) = graph_handle.await.context("graph worker join")??; + let total_wall_ms = total_start.elapsed().as_millis(); let total_wall_us = (total_wall_ms as u64).saturating_mul(1000); let eff_par_full = if total_wall_us > 0 { @@ -813,21 +794,186 @@ pub async fn mb_fetch_with_graph( 0.0 }; let avg_net_us = sum_net_us.checked_div(fetch_count).unwrap_or(0); - let unresolved_remaining = edge_targets.len(); tracing::info!( - "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={}", + "p1-breakdown mb_fetch_with_graph wall={}ms ok={} fetch={} settle={} sum_wall={}ms sum_net={}ms sum_graph={}ms avg_net={}us eff_par_full={:.1} eff_par_net={:.1} unresolved_targets={} graph_processed={} graph_new_specs={}", total_wall_ms, stats.success, fetch_count, settle_count, sum_wall_us / 1000, sum_net_us / 1000, - sum_graph_us / 1000, + 
graph_stats.sum_graph_us / 1000, avg_net_us, eff_par_full, eff_par_net, - unresolved_remaining, + graph_stats.unresolved_remaining, + graph_stats.processed, + graph_stats.new_specs_emitted, ); - Ok(stats) + Ok((graph, stats)) +} + +#[derive(Debug, Default)] +struct GraphWorkerStats { + sum_graph_us: u64, + processed: usize, + new_specs_emitted: usize, + unresolved_remaining: usize, +} + +/// CPU-only worker task that owns the graph + edge_targets + +/// seen_specs. Receives fetch events from main loop, mutates graph +/// via `process_dependency_with_resolved`, sends new pending specs +/// back. Designed to monopolize a tokio runtime worker thread so +/// the main loop's worker can drive socket polling without +/// competing for CPU. +async fn graph_worker( + mut graph: DependencyGraph, + mut edge_targets: EdgeTargets, + mut seen_specs: HashSet<(String, String)>, + cache: MemoryCache, + build_config: BuildDepsConfig, + mut fetch_rx: mpsc::Receiver, + specs_tx: mpsc::Sender>, +) -> Result<(DependencyGraph, GraphWorkerStats)> { + let mut stats = GraphWorkerStats::default(); + + while let Some(msg) = fetch_rx.recv().await { + let graph_start = Instant::now(); + stats.processed += 1; + + // Drain edge_targets for every spec keyed under this name. + // The fetch future already wrote both `(name, primary_spec)` + // and `(name, resolved_version)` cache slots, so any + // edge_targets entry for this name should hit cache. 
+ let primary_keys: Vec<(String, String)> = edge_targets + .keys() + .filter(|(n, _)| n == &msg.name) + .cloned() + .collect(); + + let mut new_specs: Vec = Vec::new(); + for (k_name, k_spec) in primary_keys { + let Some(core_arc) = cache.get_version_manifest(&k_name, &k_spec) else { + continue; + }; + let resolved = ResolvedPackage { + name: k_name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let Some(targets) = edge_targets.remove(&(k_name.clone(), k_spec.clone())) else { + continue; + }; + for (parent_idx, edge_id) in targets { + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id, + name: k_name.clone(), + spec: k_spec.clone(), + edge_type: graph + .graph + .edge_weight(edge_id) + .and_then(|e| match e { + crate::model::graph::GraphEdge::Dependency(d) => Some(d.edge_type), + _ => None, + }) + .unwrap_or(crate::model::node::EdgeType::Prod), + }; + let result = process_dependency_with_resolved( + &mut graph, + parent_idx, + &edge_info, + &resolved, + &build_config, + ); + if let ProcessResult::Created(new_idx) = result { + // Walk the new node's edges. enqueue handles + // recursive cache-hit drain so already-cached + // specs get processed inline (still on this + // worker thread — graph mutations can't run on + // multiple threads with `&mut graph`). + enqueue_node_edges_into( + &mut graph, + new_idx, + &mut new_specs, + &mut seen_specs, + &mut edge_targets, + &cache, + &build_config, + ); + } + } + } + + stats.sum_graph_us += graph_start.elapsed().as_micros() as u64; + stats.new_specs_emitted += new_specs.len(); + + // Always reply (even if empty) so main loop's `in_flight` + // counter decrements for each FetchEventMsg sent. + if specs_tx.send(new_specs).await.is_err() { + // Main loop dropped the receiver — bail. 
+ break; + } + } + + stats.unresolved_remaining = edge_targets.len(); + Ok((graph, stats)) +} + +/// Same as `enqueue_node_edges` but pushes new specs into the +/// caller-provided `out` Vec instead of a VecDeque. Used by the +/// graph worker to batch "new specs from this fetch" before sending +/// them back to the main loop in one channel message. +fn enqueue_node_edges_into( + graph: &mut DependencyGraph, + node_idx: NodeIndex, + out: &mut Vec, + seen_specs: &mut HashSet<(String, String)>, + edge_targets: &mut EdgeTargets, + cache: &MemoryCache, + build_config: &BuildDepsConfig, +) { + let mut work_stack: Vec = vec![node_idx]; + while let Some(idx) = work_stack.pop() { + let edges = collect_unresolved_edges(graph, idx); + for edge in edges { + if !edge.spec.is_registry_spec() { + continue; + } + let key = (edge.name.clone(), edge.spec.clone()); + + if let Some(core_arc) = cache.get_version_manifest(&edge.name, &edge.spec) { + let resolved = ResolvedPackage { + name: edge.name.clone(), + version: core_arc.version.clone(), + manifest: core_arc, + }; + let edge_info = crate::resolver::edges::DependencyEdgeInfo { + edge_id: edge.edge_id, + name: edge.name.clone(), + spec: edge.spec.clone(), + edge_type: edge.edge_type, + }; + if let ProcessResult::Created(new_idx) = process_dependency_with_resolved( + graph, + idx, + &edge_info, + &resolved, + build_config, + ) { + work_stack.push(new_idx); + } + continue; + } + + edge_targets + .entry(key.clone()) + .or_default() + .push((idx, edge.edge_id)); + if seen_specs.insert(key.clone()) { + out.push(key); + } + } + } } diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 837ebfc5b..6c9ada9a8 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -285,8 +285,8 @@ where peer_deps, concurrency, }; - mb_fetch_with_graph( - &mut graph, + let (returned_graph, _stats) = mb_fetch_with_graph( + graph, registry.registry_url(), registry.cache(), 
&preload_config, @@ -294,6 +294,7 @@ where ) .await .map_err(|e| e.context("mb_fetch_with_graph failed"))?; + graph = returned_graph; } // Preserve the typed error via `Error::new` + `.context(...)` so CLI From d1cf53e2f14acb5110048a33031401a38aca839a Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sat, 9 May 2026 23:54:41 +0800 Subject: [PATCH 31/32] perf(pm): integrate channel-based mb_fetch into install pipeline Plumb the PipelineReceiver through the folded mb_fetch_with_graph path so install (`utoo install`) gets the same channel-separated fetch + graph architecture as `utoo deps`, with download/clone pipelines starting as early as the legacy preload+BFS path: - mb_fetch_with_graph now takes Arc; main loop emits PackageResolved on each fetch land (looked up via cache with the new FetchOutcome::primary_spec), graph_worker emits PackagePlaced on ProcessResult::Created. - service::api::build_deps wraps the caller-supplied receiver in Arc once and shares it between mb_fetch_with_graph and build_deps_with_config; adds + 'static bound on R. - pipeline_deps_options sets skip_preload=true so install routes through the same folded path as the lockfile-only command. CI will validate that p1 resolve continues at/below 2.5s while p0_full_cold and p3_cold_install do not regress (download + clone pipelines remain saturated via emitted events). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 842 +++++++++++++++++---- crates/pm/src/helper/ruborist_context.rs | 11 +- crates/ruborist/src/resolver/mb_resolve.rs | 63 +- crates/ruborist/src/service/api.rs | 10 +- 4 files changed, 781 insertions(+), 145 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a136807b..c4b103915 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -573,6 +588,27 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "browserslist-data" version = "0.1.4" @@ -1110,6 +1146,7 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8a506ec4b81c460798f572caead636d57d3d7e940f998160f52bd254bf2d23" dependencies = [ + "brotli", "compression-core", "flate2", "memchr", @@ -1690,23 +1727,6 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "ctor" -version = "0.10.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "83cf0d42651b16c6dfe68685716d18480d18a9c39c62d76e8cf3eb6ed5d8bcbf" -dependencies = [ - "ctor-proc-macro", - "dtor", - "link-section", -] - -[[package]] -name = "ctor-proc-macro" -version = "0.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a949c44fcacbbbb7ada007dc7acb34603dd97cd47de5d054f2b6493ecebb483" - [[package]] name = "cty" version = "0.2.2" @@ -2246,21 +2266,6 @@ dependencies = [ "dtoa", ] -[[package]] -name = "dtor" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edf234dd1594d6dd434a8fb8cada51ddbbc593e40e4a01556a0b31c62da2775b" -dependencies = [ - "dtor-proc-macro", -] - -[[package]] -name = "dtor-proc-macro" -version = "0.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2647271c92754afcb174e758003cfd1cbf1e43e5a7853d7b1813e63e19e39a73" - [[package]] name = "dunce" version = "1.0.5" @@ -4824,12 +4829,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "link-section" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b685d66585d646efe09fec763d796c291049c8b6bf84e04954bffc8748341f0d" - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4944,6 +4943,21 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ca88d725a0a943b096803bd34e73a4437208b6077654cc4ecb2947a5f91618d" +[[package]] +name = "manifest-bench" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "reqwest 0.12.24", + "rustls", + "rustls-native-certs", + "serde", + "serde_json", + "tokio", +] + [[package]] name = "markdown" version = "1.0.0" @@ -5380,7 +5394,7 @@ checksum = "55740c4ae1d8696773c78fdafd5d0e5fe9bc9f1b071c7ba493ba5c413a9184f3" dependencies = [ "anyhow", "bitflags 2.9.4", - "ctor 0.2.9", + "ctor", "napi-derive", "napi-sys", "once_cell", @@ -6358,6 +6372,22 @@ version = 
"0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84350ffee5cedfabf9bee3e8825721f651da8ff79d50fe7a37cf0ca015c428ee" +[[package]] +name = "preload-bench" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "reqwest 0.12.24", + "rustls", + "rustls-native-certs", + "serde", + "serde_json", + "simd-json", + "tokio", +] + [[package]] name = "preset_env_base" version = "7.0.0" @@ -7208,9 +7238,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.11.4" +version = "0.10.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dedc5658c6ecb3bdb5ef5f3295bb9253f42dcf3fd1402c03f6b1f7659c3c4a9" +checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" dependencies = [ "bytemuck", "byteorder", @@ -8115,9 +8145,9 @@ dependencies = [ [[package]] name = "styled_components" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72418ea605a423c70ffa8590196c83b04b04636fd25aaceabe0fa7f1e15f66f0" +checksum = "99aeadac58111060ad883c7e7a01917bcecc6572243c06d41315f200cbaa9240" dependencies = [ "Inflector", "once_cell", @@ -8134,9 +8164,9 @@ dependencies = [ [[package]] name = "styled_jsx" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc5352e19f02be3ba10fb9ecbcd0d72e9b2d9762965712f1cbe737d1f428ec" +checksum = "c3917b257122e7cf3f46f95557af3178edaa9a3fd89fc1469768e05f01901e98" dependencies = [ "anyhow", "lightningcss", @@ -8155,9 +8185,9 @@ dependencies = [ "swc_css_prefixer", "swc_css_visit", "swc_ecma_ast 23.0.0", - "swc_ecma_minifier", - "swc_ecma_parser 39.0.2", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_plugin_macro", @@ -8170,6 +8200,57 @@ version = "2.6.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "swc" +version = "61.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb7d502b72d0b5e059cefe3a55825c43587a2e3c81025862694e52deecddc3de" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bytes-str", + "dashmap 5.5.3", + "either", + "indexmap 2.13.0", + "jsonc-parser", + "once_cell", + "par-core", + "par-iter", + "parking_lot", + "regex", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_compiler_base 54.0.0", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_ext_transforms", + "swc_ecma_loader", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_preset_env 52.0.0", + "swc_ecma_transforms 51.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_compat 47.0.0", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_error_reporters", + "swc_node_comments", + "swc_plugin_proxy", + "swc_plugin_runner", + "swc_sourcemap", + "swc_timer", + "swc_transform_common", + "swc_visit", + "tokio", + "tracing", + "url", +] + [[package]] name = "swc" version = "63.0.0" @@ -8193,19 +8274,19 @@ dependencies = [ "serde_json", "swc_atoms", "swc_common 21.0.1", - "swc_compiler_base", + "swc_compiler_base 55.0.0", "swc_config", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", "swc_ecma_ext_transforms", "swc_ecma_loader", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", - "swc_ecma_preset_env", - "swc_ecma_transforms", + "swc_ecma_preset_env 53.0.0", + "swc_ecma_transforms 52.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_compat", - "swc_ecma_transforms_optimization", + "swc_ecma_transforms_compat 48.0.0", + "swc_ecma_transforms_optimization 44.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", 
"swc_error_reporters", @@ -8322,6 +8403,32 @@ dependencies = [ "url", ] +[[package]] +name = "swc_compiler_base" +version = "54.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "386c6121a98d7630ef5a07b79acee964c778568d61d3b76a188be17f19418a9c" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bytes-str", + "once_cell", + "pathdiff", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_visit 23.0.0", + "swc_sourcemap", + "swc_timer", +] + [[package]] name = "swc_compiler_base" version = "55.0.0" @@ -8341,7 +8448,7 @@ dependencies = [ "swc_config", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", "swc_ecma_visit 23.0.0", "swc_sourcemap", @@ -8398,6 +8505,38 @@ dependencies = [ "vergen", ] +[[package]] +name = "swc_core" +version = "63.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb9470306b0d532da617be037de878f64ec0f04cb364d920e8cee05d658d66de" +dependencies = [ + "par-core", + "swc 61.0.0", + "swc_allocator", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_lints", + "swc_ecma_loader", + "swc_ecma_minifier 51.1.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_preset_env 52.0.0", + "swc_ecma_quote_macros", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_transforms_proposal 41.0.3", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_transforms_typescript 45.0.2", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_plugin_proxy", + "swc_plugin_runner", + "testing", + "vergen", +] + [[package]] name = "swc_core" version = "65.0.3" @@ -8405,28 +8544,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"898413141c6d3e1fed24ac3a4c57cc61ef98194df2a7957820d48ad158a318f6" dependencies = [ "par-core", - "swc", + "swc 63.0.0", "swc_allocator", "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_lints", "swc_ecma_loader", - "swc_ecma_minifier", + "swc_ecma_minifier 52.0.4", "swc_ecma_parser 39.0.2", - "swc_ecma_preset_env", - "swc_ecma_quote_macros", + "swc_ecma_preset_env 53.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_optimization", - "swc_ecma_transforms_proposal", - "swc_ecma_transforms_react", - "swc_ecma_transforms_typescript", + "swc_ecma_transforms_optimization 44.0.0", + "swc_ecma_transforms_react 46.0.1", + "swc_ecma_transforms_typescript 46.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_plugin_proxy", "swc_plugin_runner", - "testing", "vergen", ] @@ -8656,6 +8791,24 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "swc_ecma_compat_bugfixes" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22d4da77f7014b5efd416bb5208ab6e3d005ad5d532df8ced2904e50ca233d44" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2015 45.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", +] + [[package]] name = "swc_ecma_compat_bugfixes" version = "47.0.0" @@ -8666,7 +8819,7 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_es2015", + "swc_ecma_compat_es2015 46.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8674,6 +8827,18 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_compat_common" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d72d7d499e4bd4059ccfe432c1a52111a28fdd2b49b3882f18108fddfa3f6b4f" +dependencies = [ + "swc_common 21.0.1", + 
"swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_utils 29.1.0", +] + [[package]] name = "swc_ecma_compat_common" version = "38.0.0" @@ -8682,8 +8847,36 @@ checksum = "04b936fe418e2bd707298357f560d269c1bdedc86a2325f7163307fe140806bd" dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", + "swc_ecma_utils 29.1.0", +] + +[[package]] +name = "swc_ecma_compat_es2015" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5095800ee11e7c37df38a2e0fae2caa9d98b7801121d5f5ce70710ab65e21ec7" +dependencies = [ + "arrayvec", + "indexmap 2.13.0", + "is-macro", + "rustc-hash 2.1.1", + "serde", + "serde_derive", + "smallvec", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_common 37.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", ] [[package]] @@ -8703,10 +8896,10 @@ dependencies = [ "swc_common 21.0.1", "swc_config", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_common", - "swc_ecma_transformer", + "swc_ecma_compat_common 38.0.0", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8714,6 +8907,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_compat_es2016" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1358f912b0b5bdb6509f64dada8dc9ac8dc9233175b1d033c571cd34ad0bbec" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "tracing", +] + [[package]] name = "swc_ecma_compat_es2016" version = "43.0.0" @@ 
-8721,7 +8927,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4402a84df86ebd3723decdd041743ba8e48c7903bfe7f5c7c712bac46642ac90" dependencies = [ "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "tracing", @@ -8729,77 +8935,170 @@ dependencies = [ [[package]] name = "swc_ecma_compat_es2017" -version = "43.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d5f9f182e397fb69ea1f592770b67b94fe2bf201f3e6695cbeba66ccc1715a" +checksum = "65a437c6a98cbfed7b355e2da721a52b1731537b6debf81cadccc9f196bbdbba" dependencies = [ "serde", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2018" -version = "44.0.0" +name = "swc_ecma_compat_es2017" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "757acfefd8ececa3fd3491e7dcbf6da1b7b5fba602b70b8f2b36af30fac35eea" +checksum = "99d5f9f182e397fb69ea1f592770b67b94fe2bf201f3e6695cbeba66ccc1715a" dependencies = [ "serde", + "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2019" +name = "swc_ecma_compat_es2018" version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a0f39d1ebadade7d0a0a137cedec958cfd38fe99c5c69c762d879650b5e9848" +checksum = "27ffcf499581d598250e4d93d45ef64fe81b16f83c3bcb8c21d27af2004e6f54" dependencies = [ - "swc_common 21.0.1", + "serde", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", 
"swc_ecma_utils 29.1.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2020" -version = "45.0.0" +name = "swc_ecma_compat_es2018" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170d1ba05307a49e53a55f13128e991e6d250819ed2f75be267dbd9a4a14b00d" +checksum = "757acfefd8ececa3fd3491e7dcbf6da1b7b5fba602b70b8f2b36af30fac35eea" dependencies = [ "serde", - "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_es2022", - "swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", - "swc_ecma_visit 23.0.0", "tracing", ] [[package]] -name = "swc_ecma_compat_es2021" -version = "43.0.0" +name = "swc_ecma_compat_es2019" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfef1313a8410a2229aca737b65bb82c4aa45bdd6cedc0a0083688da0b960b20" +checksum = "5125766d7ca9c4789eefdb68fd9d1bc9eba1119df21ad3d1fd7b0ac2808893d0" dependencies = [ + "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms_base 42.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "tracing", ] +[[package]] +name = "swc_ecma_compat_es2019" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a0f39d1ebadade7d0a0a137cedec958cfd38fe99c5c69c762d879650b5e9848" +dependencies = [ + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2020" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eba7cf139b36cdf75daf9f1fc9096f566c8034d774ce040f09f0fccd4ffe02e" +dependencies = [ + "serde", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2022 44.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + 
"swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2020" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170d1ba05307a49e53a55f13128e991e6d250819ed2f75be267dbd9a4a14b00d" +dependencies = [ + "serde", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_es2022 45.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2021" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f64ee2ff23cdc2bb9749f3fb730bd4a95cc26cdea84b384b85574a1ab43f78af" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2021" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfef1313a8410a2229aca737b65bb82c4aa45bdd6cedc0a0083688da0b960b20" +dependencies = [ + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms_base 42.0.0", + "swc_ecma_utils 29.1.0", + "tracing", +] + +[[package]] +name = "swc_ecma_compat_es2022" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e0499dc93f8eb04c88d5cf6aefc4ce34fdcca9dd69155d6882eb011339c9dd" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_transforms_macros", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_trace_macro", + "tracing", +] + [[package]] name = "swc_ecma_compat_es2022" version = "45.0.0" @@ -8810,9 +9109,9 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - 
"swc_ecma_transformer", + "swc_ecma_transformer 14.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", "swc_ecma_transforms_macros", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -8899,6 +9198,42 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_minifier" +version = "51.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c25a685c2efe2f88ba359dde0a17382b28a206ea21b23bda612f97b2c423b2f2" +dependencies = [ + "arrayvec", + "bitflags 2.9.4", + "indexmap 2.13.0", + "num-bigint", + "num_cpus", + "once_cell", + "par-core", + "par-iter", + "parking_lot", + "phf", + "radix_fmt", + "rustc-hash 2.1.1", + "ryu-js", + "serde", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_codegen 26.0.1", + "swc_ecma_hooks", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "swc_timer", + "tracing", +] + [[package]] name = "swc_ecma_minifier" version = "52.0.4" @@ -8928,7 +9263,7 @@ dependencies = [ "swc_ecma_hooks", "swc_ecma_parser 39.0.2", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_optimization", + "swc_ecma_transforms_optimization 44.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_timer", @@ -8955,6 +9290,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_parser" +version = "38.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7c251d44e048647b5335861d1585b3e95fa8bc74f6e7a40570b0ea95d27ba66" +dependencies = [ + "bitflags 2.9.4", + "either", + "num-bigint", + "phf", + "rustc-hash 2.1.1", + "seq-macro", + "serde", + "smartstring", + "stacker", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_parser" version = "39.0.2" @@ -8969,13 +9325,37 @@ dependencies = [ "seq-macro", 
"serde", "smartstring", - "stacker", "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "tracing", ] +[[package]] +name = "swc_ecma_preset_env" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5132d5890cddc4e47feb29c3388b4b0ca2251173c2c859c4b48b896794767c54" +dependencies = [ + "anyhow", + "foldhash 0.1.5", + "indexmap 2.13.0", + "once_cell", + "precomputed-map", + "preset_env_base", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "string_enum", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transformer 13.0.0", + "swc_ecma_transforms 51.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_preset_env" version = "53.0.0" @@ -8995,17 +9375,17 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_transformer", - "swc_ecma_transforms", + "swc_ecma_transformer 14.0.0", + "swc_ecma_transforms 52.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] [[package]] name = "swc_ecma_quote_macros" -version = "39.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e4d28106d86d9c45d187687688d03bab7064bd8480d8bc783df9ff2a5d5a9a" +checksum = "16896c184ff6915c85ee4bffd08db32e010b1c1a9628e6c4ee49a233653c20a7" dependencies = [ "anyhow", "proc-macro2", @@ -9014,7 +9394,7 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_parser 39.0.2", + "swc_ecma_parser 38.0.2", "swc_macros_common", "syn 2.0.106", ] @@ -9067,6 +9447,25 @@ dependencies = [ "swc_visit", ] +[[package]] +name = "swc_ecma_transformer" +version = "13.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c334a42d7d8252e5a80dbae85a1230144d29f7ed4aa7feada2a47167f9282e" +dependencies = [ + "rustc-hash 2.1.1", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_regexp", + "swc_ecma_hooks", + "swc_ecma_regexp", + 
"swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transformer" version = "14.0.0" @@ -9086,6 +9485,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94503bbcd555d82cb33ff0e591e935bb925b79b254e94e706521f15d762b473" +dependencies = [ + "par-core", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_compat 47.0.0", + "swc_ecma_transforms_optimization 43.0.0", + "swc_ecma_transforms_proposal 41.0.3", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_transforms_typescript 45.0.2", + "swc_ecma_utils 29.1.0", +] + [[package]] name = "swc_ecma_transforms" version = "52.0.0" @@ -9096,11 +9513,11 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_compat", - "swc_ecma_transforms_optimization", - "swc_ecma_transforms_proposal", - "swc_ecma_transforms_react", - "swc_ecma_transforms_typescript", + "swc_ecma_transforms_compat 48.0.0", + "swc_ecma_transforms_optimization 44.0.0", + "swc_ecma_transforms_proposal 42.0.0", + "swc_ecma_transforms_react 46.0.1", + "swc_ecma_transforms_typescript 46.0.1", "swc_ecma_utils 29.1.0", ] @@ -9126,6 +9543,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_base" +version = "41.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6be824dc326da1f7673d1e241790626e5f39f09e1d896175134143408eeaa081" +dependencies = [ + "better_scoped_tls", + "indexmap 2.13.0", + "once_cell", + "par-core", + "phf", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_base" version = "42.0.0" @@ 
-9148,6 +9587,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_classes" +version = "41.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ffae23e996fa1a7b20b77ff599aa0e4997a6eb21369e2e5e906c91b89fdffaa" +dependencies = [ + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_classes" version = "42.0.0" @@ -9161,6 +9613,34 @@ dependencies = [ "swc_ecma_visit 23.0.0", ] +[[package]] +name = "swc_ecma_transforms_compat" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd54b7d82f0037f03367b4c9052a4ba2913e044df009fbeac388b2142c3ddd8a" +dependencies = [ + "indexmap 2.13.0", + "par-core", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_compat_bugfixes 46.0.0", + "swc_ecma_compat_common 37.0.0", + "swc_ecma_compat_es2015 45.0.0", + "swc_ecma_compat_es2016 42.0.0", + "swc_ecma_compat_es2017 42.0.0", + "swc_ecma_compat_es2018 43.0.0", + "swc_ecma_compat_es2019 42.0.0", + "swc_ecma_compat_es2020 44.0.0", + "swc_ecma_compat_es2021 42.0.0", + "swc_ecma_compat_es2022 44.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_compat" version = "48.0.0" @@ -9173,16 +9653,16 @@ dependencies = [ "swc_atoms", "swc_common 21.0.1", "swc_ecma_ast 23.0.0", - "swc_ecma_compat_bugfixes", - "swc_ecma_compat_common", - "swc_ecma_compat_es2015", - "swc_ecma_compat_es2016", - "swc_ecma_compat_es2017", - "swc_ecma_compat_es2018", - "swc_ecma_compat_es2019", - "swc_ecma_compat_es2020", - "swc_ecma_compat_es2021", - "swc_ecma_compat_es2022", + "swc_ecma_compat_bugfixes 47.0.0", + "swc_ecma_compat_common 38.0.0", + "swc_ecma_compat_es2015 46.0.0", + "swc_ecma_compat_es2016 43.0.0", + "swc_ecma_compat_es2017 43.0.0", + 
"swc_ecma_compat_es2018 44.0.0", + "swc_ecma_compat_es2019 43.0.0", + "swc_ecma_compat_es2020 45.0.0", + "swc_ecma_compat_es2021 43.0.0", + "swc_ecma_compat_es2022 45.0.0", "swc_ecma_transforms_base 42.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", @@ -9201,6 +9681,30 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "swc_ecma_transforms_optimization" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae12179c92f0690850bae8932dfac2b7f191b8bfc6bac80dd81abfe6b0c014aa" +dependencies = [ + "bytes-str", + "dashmap 5.5.3", + "indexmap 2.13.0", + "once_cell", + "par-core", + "petgraph 0.7.1", + "rustc-hash 2.1.1", + "serde_json", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", + "tracing", +] + [[package]] name = "swc_ecma_transforms_optimization" version = "44.0.0" @@ -9225,6 +9729,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "swc_ecma_transforms_proposal" +version = "41.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02c49fd90ad7ef87cfacb9e15eb939bfecac83fe6638fdd4f94a31eff56b8276" +dependencies = [ + "either", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_classes 41.0.1", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_proposal" version = "42.0.0" @@ -9238,7 +9760,32 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_classes", + "swc_ecma_transforms_classes 42.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + +[[package]] +name = "swc_ecma_transforms_react" +version = "45.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b41b35e76a78a01650dcfb92889d37fdebbc3b86932a052259c2a99e7955e699" +dependencies = [ + "base64 0.22.1", + "bytes-str", + "indexmap 2.13.0", + "once_cell", + "rustc-hash 2.1.1", + "serde", + "sha1", + "string_enum", + "swc_atoms", + "swc_common 21.0.1", + "swc_config", + "swc_ecma_ast 23.0.0", + "swc_ecma_hooks", + "swc_ecma_parser 38.0.2", + "swc_ecma_transforms_base 41.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] @@ -9268,6 +9815,24 @@ dependencies = [ "swc_ecma_visit 23.0.0", ] +[[package]] +name = "swc_ecma_transforms_typescript" +version = "45.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d25026f22efe873b50c97b3aaca6bfd178f954031effd14394e7b3add1e95fb" +dependencies = [ + "bytes-str", + "rustc-hash 2.1.1", + "serde", + "swc_atoms", + "swc_common 21.0.1", + "swc_ecma_ast 23.0.0", + "swc_ecma_transforms_base 41.0.1", + "swc_ecma_transforms_react 45.0.0", + "swc_ecma_utils 29.1.0", + "swc_ecma_visit 23.0.0", +] + [[package]] name = "swc_ecma_transforms_typescript" version = "46.0.1" @@ -9281,7 +9846,7 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_transforms_base 42.0.0", - "swc_ecma_transforms_react", + "swc_ecma_transforms_react 46.0.1", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", ] @@ -9357,9 +9922,9 @@ dependencies = [ [[package]] name = "swc_emotion" -version = "4.0.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7077db4cd3dc9908a860c2e55b40ae6de8d6ce41d919867f2e58eb81b4019718" +checksum = "11d8058e754b05eb672671b71974c4f79673b32bc2a2763706ba6970f8d2c86f" dependencies = [ "base64 0.22.1", "byteorder", @@ -9373,7 +9938,7 @@ dependencies = [ "swc_common 21.0.1", "swc_ecma_ast 23.0.0", "swc_ecma_codegen 26.0.1", - "swc_ecma_transforms", + "swc_ecma_transforms 51.0.0", "swc_ecma_utils 29.1.0", "swc_ecma_visit 23.0.0", "swc_sourcemap", @@ -9506,9 +10071,9 @@ dependencies = [ [[package]] name = "swc_relay" -version = "4.0.0" 
+version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b592abba81c24baad593d6130a162beaa50699b5c2ba791a5b0db7be2dff1db4" +checksum = "d1a0e98d0497d914f2a0736be9be050af6c3c0fbb2a9d911dae40379fffcc7c8" dependencies = [ "once_cell", "regex", @@ -10465,7 +11030,6 @@ dependencies = [ "auto-hash-map", "bincode 2.0.1", "concurrent-queue", - "ctor 0.10.1", "dashmap 6.1.0", "either", "erased-serde", @@ -10638,10 +11202,8 @@ dependencies = [ name = "turbo-tasks-malloc" version = "0.1.0" dependencies = [ - "libc", "libmimalloc-sys", "mimalloc", - "windows-sys 0.60.2", ] [[package]] @@ -10741,6 +11303,7 @@ dependencies = [ "either", "indexmap 2.13.0", "num-bigint", + "once_cell", "patricia_tree", "petgraph 0.8.3", "ref-cast", @@ -10750,7 +11313,7 @@ dependencies = [ "serde", "serde_json", "smallvec", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_sourcemap", "tracing", "turbo-bincode", @@ -10782,7 +11345,7 @@ dependencies = [ "rustc-hash 2.1.1", "serde", "smallvec", - "swc_core 65.0.3", + "swc_core 63.1.3", "tokio", "tracing", "turbo-bincode", @@ -10849,6 +11412,7 @@ dependencies = [ "itertools 0.10.5", "num-bigint", "num-traits", + "once_cell", "parking_lot", "petgraph 0.8.3", "phf", @@ -10858,7 +11422,7 @@ dependencies = [ "serde_json", "smallvec", "strsim 0.11.1", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_sourcemap", "tokio", "tracing", @@ -10873,7 +11437,6 @@ dependencies = [ "turbopack-resolve", "turbopack-swc-utils", "url", - "urlencoding", ] [[package]] @@ -10901,7 +11464,7 @@ dependencies = [ "serde_json", "styled_components", "styled_jsx", - "swc_core 65.0.3", + "swc_core 63.1.3", "swc_emotion", "swc_plugin_backend_wasmtime", "swc_relay", @@ -10951,8 +11514,9 @@ dependencies = [ "bincode 2.0.1", "image", "mime", - "phf", + "once_cell", "regex", + "rustc-hash 2.1.1", "serde", "turbo-bincode", "turbo-rcstr", @@ -10996,6 +11560,7 @@ dependencies = [ "js-sys", "napi", "napi-derive", + "once_cell", "owo-colors", "parking_lot", 
"regex", @@ -11081,7 +11646,7 @@ version = "0.1.0" dependencies = [ "anyhow", "parking_lot", - "swc_core 65.0.3", + "swc_core 63.1.3", "turbo-rcstr", "turbo-tasks", "turbopack-core", @@ -11093,6 +11658,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bincode 2.0.1", + "once_cell", "regex", "rustc-hash 2.1.1", "serde", @@ -11121,7 +11687,6 @@ dependencies = [ "rustc-hash 2.1.1", "serde", "serde_json", - "smallvec", "tungstenite 0.21.0", "turbo-rcstr", "turbo-tasks-malloc", @@ -11136,6 +11701,7 @@ dependencies = [ "anyhow", "crossbeam-channel", "crossbeam-utils", + "once_cell", "parking_lot", "postcard", "rustc-hash 2.1.1", diff --git a/crates/pm/src/helper/ruborist_context.rs b/crates/pm/src/helper/ruborist_context.rs index c8b758a6f..f5d883d8e 100644 --- a/crates/pm/src/helper/ruborist_context.rs +++ b/crates/pm/src/helper/ruborist_context.rs @@ -69,6 +69,14 @@ impl Context { /// Create BuildDepsOptions with PipelineReceiver for concurrent download/clone. /// Returns (options, channels) where channels are used to start pipeline workers. + /// + /// Sets `skip_preload=true` so ruborist's `service::api::build_deps` + /// routes through `mb_fetch_with_graph` (folded preload + graph + /// build). The pipeline still receives `PackageResolved` / + /// `PackagePlaced` events — emitted from inside + /// `mb_fetch_with_graph` (main loop and graph worker + /// respectively) — so download/clone start as early as the + /// classic preload+BFS path. 
pub async fn pipeline_deps_options( cwd: PathBuf, ) -> ( @@ -76,7 +84,8 @@ impl Context { PipelineChannels, ) { let (receiver, channels) = PipelineReceiver::new(ProgressReceiver); - let options = Self::deps_options(cwd, receiver).await; + let mut options = Self::deps_options(cwd, receiver).await; + options.skip_preload = true; (options, channels) } diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 197fcbc26..33e3819a8 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -51,6 +51,7 @@ use crate::resolver::preload::{Dep, PreloadConfig}; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; +use crate::traits::progress::{BuildEvent, EventReceiver}; use crate::traits::registry::ResolvedPackage; #[derive(Debug, Default)] @@ -131,6 +132,11 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve /// `body_cache` and trigger sibling drain. struct FetchOutcome { name: String, + /// The spec that triggered this fetch / settle. Used by the + /// main loop to look up the cached `CoreVersionManifest` for + /// `PackageResolved` event emission (the future already wrote + /// `(name, primary_spec)` to the cache). + primary_spec: String, transitives: Vec, fetched: bool, /// Per-future wall (network + body recv + spawn_blocking parse). 
@@ -211,6 +217,7 @@ fn spawn_fetch( ) -> Fut { Box::pin(async move { let fut_start = Instant::now(); + let primary_spec = spec.clone(); let url = format!("{}/{}", registry_url, name); let resp = match client .get(&url) @@ -223,6 +230,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + primary_spec, transitives: Vec::new(), fetched: true, wall_us, @@ -236,6 +244,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + primary_spec, transitives: Vec::new(), fetched: true, wall_us, @@ -270,6 +279,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + primary_spec, transitives, fetched: true, wall_us, @@ -289,6 +299,7 @@ fn spawn_settle( ) -> Fut { Box::pin(async move { let fut_start = Instant::now(); + let primary_spec = spec.clone(); let spec_for_parse = spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { @@ -314,6 +325,7 @@ fn spawn_settle( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + primary_spec, transitives, fetched: false, wall_us, @@ -593,13 +605,17 @@ struct FetchEventMsg { name: String, } -pub async fn mb_fetch_with_graph( +pub async fn mb_fetch_with_graph( mut graph: DependencyGraph, registry_url: &str, cache: &MemoryCache, preload_config: &PreloadConfig, build_config: &BuildDepsConfig, -) -> Result<(DependencyGraph, MbFetchStats)> { + receiver: Arc, +) -> Result<(DependencyGraph, MbFetchStats)> +where + R: EventReceiver + 'static, +{ let mut stats = MbFetchStats::default(); let total_start = Instant::now(); @@ -663,6 +679,7 @@ pub async fn mb_fetch_with_graph( // loop CPU starving the runtime's IO polling. 
let cache_clone = cache.clone(); let build_config_owned = build_config.clone(); + let receiver_for_graph = Arc::clone(&receiver); let graph_handle = tokio::spawn(graph_worker( graph, edge_targets, @@ -671,6 +688,7 @@ pub async fn mb_fetch_with_graph( build_config_owned, fetch_rx, specs_tx, + receiver_for_graph, )); // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). @@ -747,6 +765,21 @@ pub async fn mb_fetch_with_graph( settle_count += 1; } + // Pipeline early-start signal: emit + // PackageResolved as soon as the manifest is in + // cache. The install path's PipelineReceiver + // forwards this to the download worker so + // tarball download begins before BFS finishes. + // For lockfile-only callers (NoopReceiver), this + // is a no-op. + if let Some(core_arc) = + cache.get_version_manifest(&out.name, &out.primary_spec) + { + receiver.on_event(BuildEvent::PackageResolved( + (&*core_arc).into(), + )); + } + // Drain sibling specs deferred while the fetch // was in flight. Sibling settles also produce a // FetchEventMsg downstream. @@ -828,7 +861,8 @@ struct GraphWorkerStats { /// back. Designed to monopolize a tokio runtime worker thread so /// the main loop's worker can drive socket polling without /// competing for CPU. 
-async fn graph_worker( +#[allow(clippy::too_many_arguments)] +async fn graph_worker( mut graph: DependencyGraph, mut edge_targets: EdgeTargets, mut seen_specs: HashSet<(String, String)>, @@ -836,7 +870,12 @@ async fn graph_worker( build_config: BuildDepsConfig, mut fetch_rx: mpsc::Receiver, specs_tx: mpsc::Sender>, -) -> Result<(DependencyGraph, GraphWorkerStats)> { + receiver: Arc, +) -> Result<(DependencyGraph, GraphWorkerStats)> +where + R: EventReceiver + 'static, +{ + use crate::model::manifest::NodeManifest; let mut stats = GraphWorkerStats::default(); while let Some(msg) = fetch_rx.recv().await { @@ -888,6 +927,22 @@ async fn graph_worker( &build_config, ); if let ProcessResult::Created(new_idx) = result { + // Pipeline clone signal: emit PackagePlaced so + // the install path's clone worker can begin + // hardlinking from cache as soon as a node is + // placed in the graph. lockfile-only callers + // (NoopReceiver) drop this on the floor. + if let Some(node) = graph.get_node(new_idx) + && let NodeManifest::Registry(ref manifest) = node.manifest + { + let parent_path = graph.get_node(parent_idx).map(|p| p.path.as_path()); + receiver.on_event(BuildEvent::PackagePlaced { + package: manifest.as_ref().into(), + path: &node.path, + parent_path, + }); + } + // Walk the new node's edges. 
enqueue handles // recursive cache-hit drain so already-cached // specs get processed inline (still on this diff --git a/crates/ruborist/src/service/api.rs b/crates/ruborist/src/service/api.rs index 6c9ada9a8..2dc7d62e8 100644 --- a/crates/ruborist/src/service/api.rs +++ b/crates/ruborist/src/service/api.rs @@ -131,7 +131,7 @@ pub struct BuildDepsOutput { pub async fn build_deps(options: BuildDepsOptions) -> Result where G: Glob + Clone, - R: EventReceiver, + R: EventReceiver + 'static, { let BuildDepsOptions { cwd, @@ -279,6 +279,11 @@ where // `build_deps_with_config` call still runs to handle any // non-registry edges (workspace / git / http / file) the fold // path skipped, but on registry-only workloads it's near no-op. + // Wrap receiver in Arc so the folded mb_fetch_with_graph can + // share it with its spawned graph_worker task. The follow-up + // BFS sweep also holds an &Arc via deref. + let receiver = Arc::new(receiver); + let folded = skip_preload_caller && cache_count == 0; if folded { let preload_config = PreloadConfig { @@ -291,6 +296,7 @@ where registry.cache(), &preload_config, &config, + Arc::clone(&receiver), ) .await .map_err(|e| e.context("mb_fetch_with_graph failed"))?; @@ -305,7 +311,7 @@ where // (non-registry: workspace / git / http / file). On // registry-only workloads (the common case) the graph is fully // built already, BFS walks nothing. 
- build_deps_with_config(&mut graph, ®istry, config, &receiver) + build_deps_with_config(&mut graph, ®istry, config, &*receiver) .await .map_err(|e| anyhow::Error::new(e).context("Dependency resolution failed"))?; From 21d9c7dce4c03b507701e0e4b2ca4bc2dc7b0ad8 Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Sun, 10 May 2026 00:32:24 +0800 Subject: [PATCH 32/32] =?UTF-8?q?fix(pm):=20mb=5Ffetch=5Fwith=5Fgraph=20?= =?UTF-8?q?=E2=80=94=20normalize=20npm:=20alias=20specs=20before=20fetch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously spawn_fetch / spawn_settle used the raw dep key as both the registry path segment and the cache lookup key. For an npm-alias dep like \`\"ms\": \"npm:raw-body@2.1.3\"\` this hit \`registry/ms\` instead of \`registry/raw-body\`, parsed ms's manifest against \`npm:raw-body@2.1.3\`, and ultimately installed the real ms into \`node_modules/ms/\` rather than raw-body. e2e \`utoo-pm.sh:466\` (\"top-level ms should be raw-body\") caught this on d1cf53e2. Fix: - spawn_fetch / spawn_settle call \`normalize_spec\` to split out the real package name + spec; URL hits \`registry/{real_name}\` and the combined parse runs against \`real_spec\` so version resolution sees the right manifest envelope. - Cache writes go under both keys: the original \`(alias_name, alias_spec)\` so \`graph_worker\` finds the manifest via \`edge_targets\`, and the normalized \`(real_name, resolved_version)\` for direct-dep dedup. - Main loop dedup state (in_flight_names / deferred_by_name / body_cache) keys by real_name so two distinct aliases pointing at the same registry package share dedup; deferred entries store \`(alias_name, spec)\` so the drain spawns spawn_settle with the correct cache key. - Adds \`real_name\` to FetchOutcome so the deferred-drain step can look up by real name without re-normalizing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/ruborist/src/resolver/mb_resolve.rs | 127 ++++++++++++++------- 1 file changed, 86 insertions(+), 41 deletions(-) diff --git a/crates/ruborist/src/resolver/mb_resolve.rs b/crates/ruborist/src/resolver/mb_resolve.rs index 33e3819a8..6af0c80f1 100644 --- a/crates/ruborist/src/resolver/mb_resolve.rs +++ b/crates/ruborist/src/resolver/mb_resolve.rs @@ -48,6 +48,7 @@ use crate::resolver::builder::{ BuildDepsConfig, ProcessResult, collect_unresolved_edges, process_dependency_with_resolved, }; use crate::resolver::preload::{Dep, PreloadConfig}; +use crate::resolver::semver::normalize_spec; use crate::resolver::version::resolve_target_version; use crate::service::MemoryCache; use crate::spec::SpecStr; @@ -131,7 +132,16 @@ fn extract_transitive(manifest: &CoreVersionManifest, peer_deps: PeerDeps) -> Ve /// happened inside the future. Only `fetched=true` futures populate /// `body_cache` and trigger sibling drain. struct FetchOutcome { + /// The dep key (alias name as it appears in the parent's deps map). + /// Used by `graph_worker` to filter `edge_targets`, which is keyed + /// on the alias. name: String, + /// The real package name after npm-alias normalization (e.g. + /// `name="ms"` + `spec="npm:raw-body@2.1.3"` → `real_name="raw-body"`). + /// Used by the main loop for `body_cache` / `deferred_by_name` / + /// `in_flight_names` keying, so two distinct aliases pointing at + /// the same package share dedup. + real_name: String, /// The spec that triggered this fetch / settle. 
Used by the /// main loop to look up the cached `CoreVersionManifest` for /// `PackageResolved` event emission (the future already wrote @@ -218,7 +228,13 @@ fn spawn_fetch( Box::pin(async move { let fut_start = Instant::now(); let primary_spec = spec.clone(); - let url = format!("{}/{}", registry_url, name); + // Normalize npm-alias / workspace specs so the registry hit + // and the manifest parse run against the *real* package, not + // the alias name. Cache writes still go under the original + // (alias_name, alias_spec) key so `graph_worker` can locate + // them via `edge_targets`. + let (real_name, real_spec) = normalize_spec(&name, &spec); + let url = format!("{}/{}", registry_url, real_name); let resp = match client .get(&url) .header("accept", "application/vnd.npm.install-v1+json") @@ -230,6 +246,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + real_name, primary_spec, transitives: Vec::new(), fetched: true, @@ -244,6 +261,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; return FetchOutcome { name, + real_name, primary_spec, transitives: Vec::new(), fetched: true, @@ -254,23 +272,31 @@ fn spawn_fetch( }; let net_us = fut_start.elapsed().as_micros() as u64; let raw_arc: Arc<[u8]> = Arc::from(raw_bytes.as_ref()); - // Stash in body_cache early so concurrent sibling specs - // arriving slightly after see it on their pending pop. - body_cache.lock().insert(name.clone(), Arc::clone(&raw_arc)); - - let spec_for_parse = spec.clone(); + // Body cache is keyed by real_name so two aliases pointing at + // the same registry package share the body and only one fetch + // fires. Sibling drains know to use real_name (see + // `deferred_by_name` keying in the main loop). 
+ body_cache + .lock() + .insert(real_name.clone(), Arc::clone(&raw_arc)); + + let real_spec_for_parse = real_spec.clone(); let peer = peer_deps; - let parsed = - tokio::task::spawn_blocking(move || parse_combined(raw_arc, &spec_for_parse, peer)) - .await - .ok() - .flatten(); + let parsed = tokio::task::spawn_blocking(move || { + parse_combined(raw_arc, &real_spec_for_parse, peer) + }) + .await + .ok() + .flatten(); let transitives = match parsed { Some((full_arc, resolved, core_arc, transitives)) => { - cache.set_full_manifest(name.clone(), Arc::clone(&full_arc)); + cache.set_full_manifest(real_name.clone(), Arc::clone(&full_arc)); + // Under the alias key so `graph_worker` finds it. cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, core_arc); + // Under the real key so subsequent direct deps on + // the same package@version dedupe correctly. + cache.set_version_manifest(real_name.clone(), resolved, core_arc); transitives } None => Vec::new(), @@ -279,6 +305,7 @@ fn spawn_fetch( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + real_name, primary_spec, transitives, fetched: true, @@ -300,10 +327,11 @@ fn spawn_settle( Box::pin(async move { let fut_start = Instant::now(); let primary_spec = spec.clone(); - let spec_for_parse = spec.clone(); + let (real_name, real_spec) = normalize_spec(&name, &spec); + let real_spec_for_parse = real_spec.clone(); let peer = peer_deps; let parsed = tokio::task::spawn_blocking(move || { - parse_combined(Arc::clone(&raw), &spec_for_parse, peer) + parse_combined(Arc::clone(&raw), &real_spec_for_parse, peer) }) .await .ok() @@ -312,11 +340,12 @@ fn spawn_settle( let transitives = match parsed { Some((full_arc, resolved, core_arc, transitives)) => { // Don't overwrite full_manifest — the original fetcher - // already set it. Only populate the version-manifest - // slots so BFS hits the (name, spec) early-return. 
- cache.set_full_manifest(name.clone(), full_arc); + // already set it under real_name. Populate version + // slots so BFS hits the (alias_name, alias_spec) + // early-return. + cache.set_full_manifest(real_name.clone(), full_arc); cache.set_version_manifest(name.clone(), spec, Arc::clone(&core_arc)); - cache.set_version_manifest(name.clone(), resolved, core_arc); + cache.set_version_manifest(real_name.clone(), resolved, core_arc); transitives } None => Vec::new(), @@ -325,6 +354,7 @@ fn spawn_settle( let wall_us = fut_start.elapsed().as_micros() as u64; FetchOutcome { name, + real_name, primary_spec, transitives, fetched: false, @@ -376,12 +406,15 @@ pub async fn mb_fetch( } } - // Sibling-fetch dedup: when two specs for the same name are both - // in flight, only the first fires a fetch; the second arrives at - // the cached body and goes through `spawn_settle` instead. + // Sibling-fetch dedup: when two specs for the same package are + // both in flight, only the first fires a fetch; the second + // arrives at the cached body and goes through `spawn_settle`. + // Keyed by *real* package name (post npm-alias normalization) + // so two distinct aliases pointing at the same registry package + // share dedup. let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); - let mut in_flight_names: HashSet = HashSet::new(); - let mut deferred_by_name: HashMap> = HashMap::new(); + let mut in_flight_real_names: HashSet = HashSet::new(); + let mut deferred_by_real_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); @@ -391,14 +424,18 @@ pub async fn mb_fetch( let Some((name, spec)) = pending.pop_front() else { break; }; + let (real_name, _) = normalize_spec(&name, &spec); // Sibling fast path: body already cached. 
- if let Some(raw) = body_cache.lock().get(&name).cloned() { + if let Some(raw) = body_cache.lock().get(&real_name).cloned() { futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); continue; } - // Defer if a fetch for this name is already in flight. - if !in_flight_names.insert(name.clone()) { - deferred_by_name.entry(name).or_default().push(spec); + // Defer if a fetch for this real package is already in flight. + if !in_flight_real_names.insert(real_name.clone()) { + deferred_by_real_name + .entry(real_name) + .or_default() + .push((name, spec)); continue; } futs.push(spawn_fetch( @@ -439,12 +476,12 @@ pub async fn mb_fetch( // Drain sibling specs deferred while the fetch was in flight. if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() + && let Some(siblings) = deferred_by_real_name.remove(&out.real_name) + && let Some(raw) = body_cache.lock().get(&out.real_name).cloned() { - for sibling_spec in siblings { + for (sibling_name, sibling_spec) in siblings { futs.push(spawn_settle( - out.name.clone(), + sibling_name, sibling_spec, Arc::clone(&raw), cache.clone(), @@ -692,9 +729,13 @@ where )); // Sibling-fetch dedup stays in main loop (drives FuturesUnordered). + // Keyed by *real* package name (post npm-alias normalization) + // so two distinct aliases pointing at the same registry package + // share dedup; siblings store their alias `(name, spec)` so the + // drain knows how to spawn `spawn_settle` with the right cache key. 
let body_cache: Arc>>> = Arc::new(Mutex::new(HashMap::new())); - let mut in_flight_names: HashSet = HashSet::new(); - let mut deferred_by_name: HashMap> = HashMap::new(); + let mut in_flight_real_names: HashSet = HashSet::new(); + let mut deferred_by_real_name: HashMap> = HashMap::new(); let mut futs: FuturesUnordered = FuturesUnordered::new(); let mut sum_wall_us: u64 = 0; @@ -712,12 +753,16 @@ where let Some((name, spec)) = pending.pop_front() else { break; }; - if let Some(raw) = body_cache.lock().get(&name).cloned() { + let (real_name, _) = normalize_spec(&name, &spec); + if let Some(raw) = body_cache.lock().get(&real_name).cloned() { futs.push(spawn_settle(name, spec, raw, cache.clone(), peer_deps)); continue; } - if !in_flight_names.insert(name.clone()) { - deferred_by_name.entry(name).or_default().push(spec); + if !in_flight_real_names.insert(real_name.clone()) { + deferred_by_real_name + .entry(real_name) + .or_default() + .push((name, spec)); continue; } futs.push(spawn_fetch( @@ -784,12 +829,12 @@ where // was in flight. Sibling settles also produce a // FetchEventMsg downstream. if out.fetched - && let Some(siblings) = deferred_by_name.remove(&out.name) - && let Some(raw) = body_cache.lock().get(&out.name).cloned() + && let Some(siblings) = deferred_by_real_name.remove(&out.real_name) + && let Some(raw) = body_cache.lock().get(&out.real_name).cloned() { - for sibling_spec in siblings { + for (sibling_name, sibling_spec) in siblings { futs.push(spawn_settle( - out.name.clone(), + sibling_name, sibling_spec, Arc::clone(&raw), cache.clone(),