diff --git a/Cargo.lock b/Cargo.lock index 16bcb0f104fd..b26b2d52cb28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7258,6 +7258,7 @@ dependencies = [ "ic-secp256k1", "ic-test-utilities-types", "ic-types", + "ic-universal-canister", "ic-validator", "itertools 0.12.1", "prost 0.13.5", diff --git a/dev/local-net/prep.sh b/dev/local-net/prep.sh index 003f8af6e7b4..6aa3f71dd785 100755 --- a/dev/local-net/prep.sh +++ b/dev/local-net/prep.sh @@ -97,6 +97,7 @@ docker run --rm \ --nns-subnet-index "$SUBNET_IDX" \ --provisional-whitelist /bootstrap/.provisional_whitelist.json \ --use-specified-ids-allocation-range \ + --dkg-interval-length 49 \ "${NODE_ARGS[@]}" # Permissions: ic-prep wrote as root inside the container. Make the @@ -139,6 +140,13 @@ for i in 0 1 2 3; do crypto: { crypto_root: "/etc/ic/crypto", }, + hypervisor: { + // Nano profile: threshold == subnet memory capacity disables the storage + // cycle-reservation mechanism; small reservation lets canisters allocate + // up to ~the full subnet memory capacity. 
+ subnet_memory_threshold: 536870912, + subnet_memory_reservation: 16777216, + }, http_handler: { listen_addr: "[::]:$HTTP_PORT", }, diff --git a/rs/canister_client/Cargo.toml b/rs/canister_client/Cargo.toml index 1dfd761ad595..905df0adade2 100644 --- a/rs/canister_client/Cargo.toml +++ b/rs/canister_client/Cargo.toml @@ -31,6 +31,7 @@ url = { workspace = true } [dev-dependencies] hex = { workspace = true } ic-certification-test-utils = { path = "../certification/test-utils" } +ic-universal-canister = { path = "../universal_canister/lib" } ic-crypto-test-utils-reproducible-rng = { path = "../crypto/test_utils/reproducible_rng" } ic-crypto-test-utils-root-of-trust = { path = "../crypto/test_utils/root_of_trust" } ic-crypto-test-utils-tls = { path = "../crypto/test_utils/tls" } diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs new file mode 100644 index 000000000000..3dfda207a5b5 --- /dev/null +++ b/rs/canister_client/examples/hammer.rs @@ -0,0 +1,701 @@ +//! Stress-test driver for the local 4-node subnet (dev/local-net). +//! +//! Deploys N universal canisters via `provisional_create_canister_with_cycles` +//! and then hammers the subnet with compute and memory load, reporting how it +//! holds up. Drives the public endpoint with the in-repo `ic-canister-client` +//! Agent, so it needs no dfx / external SDK. +//! +//! Run with: +//! cargo run -p ic-canister-client --example hammer --release -- http://localhost:8080 +//! +//! Env knobs: HAMMER_CANISTERS (default 6), HAMMER_SECS (per storm phase, +//! default 15), HAMMER_CONCURRENCY (default 48), HAMMER_MODE (unset = full run; +//! otherwise one of probe, read, heap, heapread, calls, fanout, hybrid), +//! HAMMER_CALL_DEPTH (calls mode, default 4), HAMMER_FANOUT_MULT (fanout mode, +//! default 1). 
+
+use ic_canister_client::{Agent, Sender};
+use ic_management_canister_types_private::{
+    CanisterIdRecord, CanisterInstallMode, IC_00, InstallCodeArgs, Method, Payload,
+    ProvisionalCreateCanisterWithCyclesArgs,
+};
+use ic_types::{CanisterId, PrincipalId};
+use ic_universal_canister::{call_args, get_universal_canister_wasm, wasm};
+use std::collections::BTreeMap;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+use url::Url;
+
+/// Number of bytes in one MiB.
+const MIB: u32 = 1024 * 1024;
+// A canister id that lives in this subnet's allocation range (see DEPLOY.md);
+// used only to route `provisional_create_canister_with_cycles`.
+const ROUTING_CANISTER_ID: &str = "bnz7o-iuaaa-aaaaa-qaaaa-cai";
+
+/// Process-wide counter that backs [`next_nonce`]; starts at 1.
+static NONCE: AtomicU64 = AtomicU64::new(1);
+
+/// Returns a fresh nonce: the next counter value as 8 little-endian bytes.
+fn next_nonce() -> Vec<u8> {
+    NONCE.fetch_add(1, Ordering::Relaxed).to_le_bytes().to_vec()
+}
+
+/// Counters and an error histogram for one load phase.
+///
+/// All fields are updated through `&self`, so one instance can be shared by
+/// many worker tasks behind an `Arc`.
+#[derive(Default)]
+struct Stats {
+    /// Number of calls that returned `Ok`.
+    ok: AtomicU64,
+    /// Number of calls that returned `Err`.
+    err: AtomicU64,
+    /// Sum of the latencies of all recorded calls (ok and err), in ms.
+    lat_sum_ms: AtomicU64,
+    /// Largest latency of any recorded call, in ms.
+    lat_max_ms: AtomicU64,
+    /// Occurrence count per (truncated) error message.
+    err_classes: Mutex<BTreeMap<String, u64>>,
+}
+
+impl Stats {
+    /// Maximum number of whitespace-separated words kept from an error message.
+    const CLASS_MAX_WORDS: usize = 60;
+    /// Maximum number of characters kept from an error message.
+    const CLASS_MAX_CHARS: usize = 400;
+
+    /// Records the outcome of one call that was started at `started`.
+    ///
+    /// The latency is measured here (time elapsed since `started`) and is
+    /// accounted for both successful and failed calls.
+    fn record(&self, started: Instant, result: &Result<Option<Vec<u8>>, String>) {
+        // `as_millis` is a u128; saturate instead of silently truncating.
+        let ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX);
+        self.lat_sum_ms.fetch_add(ms, Ordering::Relaxed);
+        self.lat_max_ms.fetch_max(ms, Ordering::Relaxed);
+        match result {
+            Ok(_) => {
+                self.ok.fetch_add(1, Ordering::Relaxed);
+            }
+            Err(e) => {
+                self.err.fetch_add(1, Ordering::Relaxed);
+                // Collapse to a short class so the histogram stays readable:
+                // normalise whitespace, then cap the word and character counts.
+                let class = e
+                    .split_whitespace()
+                    .take(Self::CLASS_MAX_WORDS)
+                    .collect::<Vec<_>>()
+                    .join(" ");
+                let class: String = class.chars().take(Self::CLASS_MAX_CHARS).collect();
+                *self.err_classes.lock().unwrap().entry(class).or_insert(0) += 1;
+            }
+        }
+    }
+
+    /// Prints a summary of this phase under `label`.
+    ///
+    /// `wall` is the wall-clock duration of the phase; throughput is the
+    /// number of successful calls divided by it. The average latency is taken
+    /// over all recorded calls (ok + err).
+    fn report(&self, label: &str, wall: Duration) {
+        let ok = self.ok.load(Ordering::Relaxed);
+        let err = self.err.load(Ordering::Relaxed);
+        let total = ok + err;
+        // No calls recorded -> report 0 rather than dividing by zero.
+        let avg = self
+            .lat_sum_ms
+            .load(Ordering::Relaxed)
+            .checked_div(total)
+            .unwrap_or(0);
+        // Clamp the divisor so a near-zero wall time cannot produce inf/NaN.
+        let rps = ok as f64 / wall.as_secs_f64().max(0.001);
+        println!(
+            "\n── {label} ──\n ok={ok} err={err} throughput={rps:.1} ok/s \
+             latency avg={avg}ms max={}ms (wall {:.1}s)",
+            self.lat_max_ms.load(Ordering::Relaxed),
+            wall.as_secs_f64()
+        );
+        let classes = self.err_classes.lock().unwrap();
+        if !classes.is_empty() {
+            println!(" error classes:");
+            for (c, n) in classes.iter() {
+                println!(" [{n:>5}] {c}");
+            }
+        }
+    }
+}
+
+/// Sends `payload` to the `update` method of `canister` with a fresh nonce and
+/// returns the agent's result unchanged.
+async fn update(
+    agent: &Agent,
+    canister: &CanisterId,
+    payload: Vec<u8>,
+) -> Result<Option<Vec<u8>>, String> {
+    agent
+        .execute_update(canister, canister, "update", payload, next_nonce())
+        .await
+}
+
+/// Build a payload that, executed by the ingress-target canister, makes it call
+/// canisters[start+1], which calls canisters[start+2], ... `depth` hops deep
+/// around the canister ring; the innermost canister replies and the replies
+/// propagate back. Generates ~2*depth inter-canister messages per ingress
+/// (depth requests + depth responses), with `depth` outstanding callbacks at
+/// peak (each holding a guaranteed-response memory reservation).
+///
+/// Indices wrap modulo `canisters.len()`, so this panics if `canisters` is
+/// empty and `depth > 0`.
+fn chain_payload(canisters: &[CanisterId], start: usize, depth: usize) -> Vec<u8> {
+    let k = canisters.len();
+    // Build inside-out: start from the innermost callee (which just replies)
+    // and wrap it in one `call_simple` per hop, outermost hop last.
+    // NOTE: `% k` panics when `canisters` is empty and `depth > 0`; callers
+    // must pass a non-empty pool.
+    (1..=depth)
+        .rev()
+        .fold(wasm().reply().build(), |inner, hop| {
+            let callee = canisters[(start + hop) % k].get().as_slice().to_vec();
+            wasm()
+                .call_simple(callee, "update", call_args().other_side(inner))
+                .build()
+        })
+}
+
+/// Build a payload that makes the ingress-target canister fire `mult` parallel
+/// update calls to every canister in `canisters` (fan-out, fire-and-forget:
+/// on_reply/on_reject are no-ops), then reply to the ingress immediately. Each
+/// in-flight ingress thus leaves N = canisters.len() * mult outstanding
+/// inter-canister calls, each holding a guaranteed-response memory reservation
+/// (~2 MiB) and a callback slot — so C concurrent ingresses drive up to C*N
+/// simultaneous outstanding calls, stressing the 64 MiB guaranteed-response
+/// cap and the callback limits.
+///
+/// With an empty `canisters` slice or `mult == 0` the payload only replies.
+fn fanout_payload(canisters: &[CanisterId], mult: usize) -> Vec<u8> {
+    let callee_runs = wasm().reply().build(); // callee just replies
+    let noop = wasm().noop().build(); // fire-and-forget: ignore the response
+    let mut p = wasm();
+    // Fire `mult` calls to each canister in ONE message: all the response-memory
+    // reservations are taken before any callee runs, so a single message issuing
+    // > ~32 calls exceeds the 64 MiB guaranteed-response cap.
+    for _ in 0..mult {
+        for c in canisters {
+            let callee = c.get().as_slice().to_vec();
+            p = p
+                .call_new(
+                    callee,
+                    "update",
+                    call_args().on_reply(noop.clone()).on_reject(noop.clone()),
+                )
+                .call_data_append(&callee_runs)
+                .call_perform();
+        }
+    }
+    p.reply().build()
+}
+
+/// Create + install a universal canister; optionally pre-grow its stable memory.
+///
+/// Creates the canister through the management canister's provisional API
+/// (routed via `routing_id`), installs the universal canister Wasm, and, when
+/// `pre_grow_pages > 0`, sends one update that runs `stable_grow(pre_grow_pages)`.
+/// Returns the new canister id, or the first error as a string.
+async fn deploy_one(
+    agent: &Agent,
+    routing_id: &CanisterId,
+    pre_grow_pages: u32,
+) -> Result<CanisterId, String> {
+    let args = ProvisionalCreateCanisterWithCyclesArgs::new(
+        Some(1_000_000_000_000_000_u128), // 1 Pcycle, never freezes
+        None,
+    );
+    let reply = agent
+        .execute_update(
+            routing_id,
+            &IC_00,
+            Method::ProvisionalCreateCanisterWithCycles,
+            args.encode(),
+            next_nonce(),
+        )
+        .await?
+        .ok_or("provisional_create: empty reply")?;
+    let canister_id = CanisterIdRecord::decode(&reply)
+        .map_err(|e| format!("decode CanisterIdRecord: {e}"))?
+        .get_canister_id();
+
+    agent
+        .install_canister(InstallCodeArgs::new(
+            CanisterInstallMode::Install,
+            canister_id,
+            get_universal_canister_wasm(),
+            vec![],
+        ))
+        .await?;
+
+    if pre_grow_pages > 0 {
+        update(
+            agent,
+            &canister_id,
+            wasm().stable_grow(pre_grow_pages).reply().build(),
+        )
+        .await?;
+    }
+    Ok(canister_id)
+}
+
+/// Shared factory that produces the payload for the next call of a storm.
+type PayloadFn = Arc<dyn Fn() -> Vec<u8> + Send + Sync>;
+
+/// Run `make_payload` against the canister pool from `concurrency` workers until
+/// `dur` elapses.
+///
+/// Workers pick their target round-robin through one shared counter. With
+/// `is_query` the payload goes to the `query` method, otherwise to `update`.
+/// Returns the aggregated [`Stats`]; an empty pool yields empty stats.
+async fn storm(
+    agent: Arc<Agent>,
+    canisters: Arc<Vec<CanisterId>>,
+    concurrency: usize,
+    dur: Duration,
+    make_payload: PayloadFn,
+    is_query: bool,
+) -> Stats {
+    // Without this guard `% canisters.len()` below would panic inside every
+    // worker task and the panics would be swallowed when joining the handles.
+    if canisters.is_empty() {
+        return Stats::default();
+    }
+    let stats = Arc::new(Stats::default());
+    let deadline = Instant::now() + dur;
+    let rr = Arc::new(AtomicU64::new(0));
+    let mut handles = Vec::with_capacity(concurrency);
+    for _ in 0..concurrency {
+        let agent = agent.clone();
+        let canisters = canisters.clone();
+        let stats = stats.clone();
+        let rr = rr.clone();
+        let make_payload = make_payload.clone();
+        handles.push(tokio::spawn(async move {
+            while Instant::now() < deadline {
+                let idx = rr.fetch_add(1, Ordering::Relaxed) as usize % canisters.len();
+                let canister = canisters[idx];
+                let payload = make_payload();
+                let started = Instant::now();
+                let res = if is_query {
+                    agent.execute_query(&canister, "query", payload).await
+                } else {
+                    update(&agent, &canister, payload).await
+                };
+                stats.record(started, &res);
+            }
+        }));
+    }
+    for h in handles {
+        // A panicked worker only loses its own remaining iterations.
+        let _ = h.await;
+    }
+    // Every worker has finished, so their clones of `stats` are gone and this
+    // is the only remaining reference.
+    Arc::try_unwrap(stats).unwrap_or_default()
+}
+
+/// Reads env var `name` and parses it as `T`; returns `default` when the
+/// variable is unset or does not parse.
+fn env_or<T: FromStr>(name: &str, default: T) -> T {
+    std::env::var(name)
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(default)
+}
+
+/// Renders a probe result for printing: `ok` on success, otherwise `err_tag`
+/// followed by the first `max_chars` characters of the error message.
+fn outcome<T>(res: &Result<T, String>, ok: &str, err_tag: &str, max_chars: usize) -> String {
+    match res {
+        Ok(_) => ok.to_string(),
+        Err(e) => format!("{err_tag} {}", e.chars().take(max_chars).collect::<String>()),
+    }
+}
+
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    let url = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "http://localhost:8080".to_string());
+    let num_canisters: usize = env_or("HAMMER_CANISTERS", 6);
+    let secs: u64 = env_or("HAMMER_SECS", 15);
+    let concurrency: usize = env_or("HAMMER_CONCURRENCY", 48);
+    let phase = Duration::from_secs(secs);
+
+    // HAMMER_MODE is read once; an unset (or non-unicode) value selects the
+    // default full run.
+    let mode = std::env::var("HAMMER_MODE").unwrap_or_default();
+    // probe mode: skip the throughput/compute/growth storms, run only the
+    // per-message dirty-page-limit probe (Phase C).
+    let probe_only = mode == "probe";
+    // read mode: populate canisters with large stable state, then run
+    // read-heavy queries across all canisters; plus a read-limit probe.
+    let read_mode = mode == "read";
+    // heap mode: the heap-memory (Wasm) analogue of the stable-memory tests
+    // (compute/dirty-limit/read). Heap has no per-execution dirty/accessed cap
+    // (the 32 MiB limits are stable-only), so a single message can touch
+    // arbitrarily large heap.
+    let heap_mode = mode == "heap";
+    // calls mode: thrash inter-canister communication — each ingress triggers a
+    // chain of canister-to-canister update calls `HAMMER_CALL_DEPTH` hops deep.
+    let calls_mode = mode == "calls";
+    let call_depth: usize = env_or("HAMMER_CALL_DEPTH", 4);
+    // fanout mode: each ingress fires N parallel calls (fire-and-forget) →
+    // N outstanding calls per in-flight ingress, to stress the guaranteed-
+    // response memory reservation and callback limits.
+    let fanout_mode = mode == "fanout";
+    // hybrid mode: reads + writes + inter-canister messaging all at once.
+    let hybrid_mode = mode == "hybrid";
+    // heapread mode: large heap-memory reads pulling lots of distinct state into
+    // RAM. Each canister holds a 96 MiB heap global (built via append, small
+    // transient); reads use queries (heap reads via update would OOM because
+    // get_global_data copies the global to the stack, permanently growing heap).
+    let heapread_mode = mode == "heapread";
+
+    let agent = Arc::new(Agent::new(
+        Url::parse(&url).expect("bad url"),
+        Sender::Anonymous,
+    ));
+    let routing_id =
+        CanisterId::unchecked_from_principal(PrincipalId::from_str(ROUTING_CANISTER_ID).unwrap());
+
+    println!("== hammer ==");
+    println!("target={url} canisters={num_canisters} phase_secs={secs} concurrency={concurrency}");
+
+    // ---- Deploy ----
+    println!("\n[1/5] deploying {num_canisters} universal canisters (pre-grow 32 MiB stable each)...");
+    let t0 = Instant::now();
+    let mut canisters = Vec::with_capacity(num_canisters);
+    for i in 0..num_canisters {
+        // 512 Wasm pages x 64 KiB = 32 MiB of stable memory.
+        match deploy_one(&agent, &routing_id, 512).await {
+            Ok(id) => {
+                println!(" + canister {i} = {id}");
+                canisters.push(id);
+            }
+            Err(e) => println!(" ! deploy {i} failed: {e}"),
+        }
+    }
+    // Everything below indexes into the pool, so it must be non-empty.
+    if canisters.is_empty() {
+        eprintln!("no canisters deployed; aborting");
+        std::process::exit(1);
+    }
+    println!(" deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
+    let canisters = Arc::new(canisters);
+
+    if hybrid_mode {
+        // ---- Hybrid: heavy reads + writes + inter-canister messaging at once ----
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let grow_pages = chunk / 65536;
+        let windows = (BIG_MIB * MIB) / chunk;
+        println!("\n[hybrid] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        let mut werr = 0u32;
+        for (i, c) in canisters.iter().enumerate() {
+            for w in 0..windows {
+                let p = wasm()
+                    .stable_grow(grow_pages)
+                    .stable_fill(w * chunk, 0x40 + i as u32, chunk)
+                    .reply()
+                    .build();
+                // Best effort: a failed window is counted, not fatal.
+                if update(&agent, c, p).await.is_err() {
+                    werr += 1;
+                }
+            }
+        }
+        if werr > 0 {
+            println!("[hybrid] {werr} populate writes failed");
+        }
+        println!("[hybrid] waiting ~20s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(20)).await;
+
+        // Three concurrent storms over the full canister pool, splitting the
+        // concurrency budget. Each canister sees a mix of query-reads,
+        // update-writes, and inter-canister call chains simultaneously.
+        let each = (concurrency / 3).max(1);
+        let windows = u64::from(windows);
+        let roff = AtomicU64::new(0);
+        let read_mk: PayloadFn = Arc::new(move || {
+            let n = roff.fetch_add(1, Ordering::Relaxed);
+            wasm().stable_read(((n % windows) as u32) * chunk, chunk).reply().build()
+        });
+        let woff = AtomicU64::new(0);
+        let write_mk: PayloadFn = Arc::new(move || {
+            let n = woff.fetch_add(1, Ordering::Relaxed);
+            // overwrite 8 MiB within an existing window (dirties, no growth)
+            wasm().stable_fill(((n % windows) as u32) * chunk, 0x77, 8 * MIB).reply().build()
+        });
+        let msg_cans = canisters.clone();
+        let mctr = AtomicU64::new(0);
+        let msg_mk: PayloadFn = Arc::new(move || {
+            let n = mctr.fetch_add(1, Ordering::Relaxed) as usize;
+            chain_payload(&msg_cans, n % msg_cans.len(), 3)
+        });
+        println!(
+            "\n[hybrid] storm ({secs}s): reads(query 24 MiB) + writes(update 8 MiB) + messages(3-hop chains), {each} concurrent each over {} canisters",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let (rs, ws, ms) = tokio::join!(
+            storm(agent.clone(), canisters.clone(), each, phase, read_mk, true),
+            storm(agent.clone(), canisters.clone(), each, phase, write_mk, false),
+            storm(agent.clone(), canisters.clone(), each, phase, msg_mk, false),
+        );
+        rs.report("HYBRID reads (query, 24 MiB stable_read)", t.elapsed());
+        ws.report("HYBRID writes (update, 8 MiB stable_fill)", t.elapsed());
+        ms.report("HYBRID messages (3-hop call chains)", t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if calls_mode {
+        // ---- Inter-canister call thrash ----
+        // Each ingress makes the target canister start a `call_depth`-hop chain
+        // of update calls around the canister ring. With C concurrent ingresses
+        // there are up to C*call_depth outstanding inter-canister calls at peak.
+        let cans = canisters.clone();
+        let ctr = AtomicU64::new(0);
+        let mk: PayloadFn = Arc::new(move || {
+            let n = ctr.fetch_add(1, Ordering::Relaxed) as usize;
+            chain_payload(&cans, n % cans.len(), call_depth)
+        });
+        println!(
+            "\n[calls] inter-canister call storm ({secs}s): {call_depth}-hop chains, {concurrency} concurrent ingresses across {} canisters",
+            canisters.len()
+        );
+        println!(
+            " (~{} inter-canister messages per ingress; up to {} outstanding calls at peak)",
+            2 * call_depth,
+            concurrency * call_depth
+        );
+        let t = Instant::now();
+        let stats = storm(agent.clone(), canisters.clone(), concurrency, phase, mk, false).await;
+        stats.report(&format!("INTER-CANISTER CALLS ({call_depth}-hop chains)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if fanout_mode {
+        // ---- Inter-canister FAN-OUT thrash (stresses response-memory reservation) ----
+        let mult: usize = env_or("HAMMER_FANOUT_MULT", 1);
+        let n = canisters.len() * mult;
+        let cans = canisters.clone();
+        let mk: PayloadFn = Arc::new(move || fanout_payload(&cans, mult));
+        println!(
+            "\n[fanout] inter-canister FAN-OUT storm ({secs}s): each ingress fires {n} parallel calls in ONE message (fire-and-forget), {concurrency} concurrent ingresses"
+        );
+        println!(
+            " ({n} simultaneous reservations/ingress; the 64 MiB guaranteed-response cap allows only ~32 — expect rejections when {n} > ~32)"
+        );
+        let t = Instant::now();
+        let stats = storm(agent.clone(), canisters.clone(), concurrency, phase, mk, false).await;
+        stats.report(&format!("INTER-CANISTER FAN-OUT (x{n} parallel/ingress)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if read_mode {
+        // Populate each canister with real stable data, written in 24 MiB
+        // chunks to respect the 32 MiB per-message dirty limit. Only whole
+        // chunks below BIG_MIB are written: 5 x 24 MiB = 120 MiB.
+        const BIG_MIB: u32 = 128;
+        let chunk: u32 = 24 * MIB;
+        let grow_pages = chunk / 65536; // 64 KiB Wasm pages per 24 MiB step
+        let windows = (BIG_MIB * MIB) / chunk;
+        let populated_mib = windows * (chunk / MIB);
+        println!("\n[read] populating {} canisters to ~{populated_mib} MiB stable each...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut werr = 0u32;
+            // Grow + fill one 24 MiB window at a time: a single 128 MiB grow can
+            // be rejected, but small incremental grows reliably build the state.
+            for w in 0..windows {
+                let p = wasm()
+                    .stable_grow(grow_pages)
+                    .stable_fill(w * chunk, 0x40 + i as u32, chunk)
+                    .reply()
+                    .build();
+                if update(&agent, c, p).await.is_err() {
+                    werr += 1;
+                }
+            }
+            println!(" canister {i} = {c} populated ({werr} write errors)");
+        }
+        println!("[read] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read 24 MiB (< 32 MiB accessed limit) per call, cycling the offset
+        // window across the populated range.
+        let windows = u64::from(windows);
+        let off_ctr = AtomicU64::new(0);
+        let mk: PayloadFn = Arc::new(move || {
+            let n = off_ctr.fetch_add(1, Ordering::Relaxed);
+            let off = ((n % windows) as u32) * chunk;
+            wasm().stable_read(off, chunk).reply().build()
+        });
+        println!(
+            "\n[read] read storm ({secs}s): 24 MiB stable_read/call — QUERIES across all {} canisters (cycling full range)",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let qs = storm(agent.clone(), canisters.clone(), concurrency, phase, mk, true).await;
+        qs.report("READ-QUERY (24 MiB stable_read, all canisters)", t.elapsed());
+
+        // Read-limit probe: access 48 MiB in one execution (> 32 MiB accessed
+        // limit) -> expect a trap, for both update and query.
+        println!("\n[read] read-limit probe: 48 MiB stable_read in one execution (accessed limit 32 MiB)");
+        let ru = update(&agent, &canisters[0], wasm().stable_read(0, 48 * MIB).reply().build()).await;
+        println!(" update read 48 MiB: {}", outcome(&ru, "OK (no limit!)", "TRAP", 220));
+        let rq = agent
+            .execute_query(
+                &canisters[canisters.len() - 1],
+                "query",
+                wasm().stable_read(0, 48 * MIB).reply().build(),
+            )
+            .await;
+        println!(" query read 48 MiB: {}", outcome(&rq, "OK (no limit!)", "TRAP", 220));
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if heapread_mode {
+        // Build a large heap global per canister via append (24 MiB chunks, so
+        // the transient heap stays small; the sizing note assumed 3 canisters
+        // whose globals all fit under the cap).
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let appends = (BIG_MIB * MIB) / chunk;
+        println!("\n[heapread] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut ok = true;
+            for _ in 0..appends {
+                let p = wasm()
+                    .push_equal_bytes(0x41 + i as u32, chunk)
+                    .append_to_global_data()
+                    .reply()
+                    .build();
+                // Keep appending after a failure; just flag the canister.
+                if update(&agent, c, p).await.is_err() {
+                    ok = false;
+                }
+            }
+            println!(" canister {i} = {c}: {}", if ok { "populated" } else { "PARTIAL/FAILED" });
+        }
+        println!("[heapread] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read the full 96 MiB global per call via queries on ALL canisters.
+        // (Heap reads via update OOM: the get_global_data stack copy permanently
+        // grows the heap. Queries discard it.) With 3 canisters this pulls
+        // ~3x96 MiB of distinct heap state into the page cache.
+        println!(
+            "\n[heapread] heap-read QUERY storm ({secs}s): get_global_data ({BIG_MIB} MiB) on all {} canisters, {concurrency} concurrent",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let qs = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            phase,
+            Arc::new(|| wasm().get_global_data().reply().build()),
+            true,
+        )
+        .await;
+        qs.report("HEAP-READ-QUERY (96 MiB/read)", t.elapsed());
+        println!("\n== done ==");
+        return;
+    }
+
+    if heap_mode {
+        // ---- Heap per-message write probe ----
+        // Stable memory traps a single message that dirties/accesses > 32 MiB;
+        // heap (Wasm) memory has no such per-message cap. push_equal_bytes(b, n)
+        // pushes n bytes onto the data stack, dirtying n bytes of heap.
+        println!("\n[heap] per-message heap-write probe (stable's per-msg limit is 32 MiB; heap has none)");
+        for sz in [24u32, 48, 96] {
+            let r = update(&agent, &canisters[0], wasm().push_equal_bytes(0x61, sz * MIB).reply().build()).await;
+            println!(" push {sz} MiB onto heap in ONE message: {}", outcome(&r, "OK", "TRAP", 200));
+        }
+
+        // ---- Heap-write storm (analogue of the COMPUTE storm) ----
+        // Updates go to all canisters but the last, which is reserved for
+        // queries below. With a single canister this pool is empty and the
+        // update storms report zero calls.
+        let upd_cans = Arc::new(canisters[..canisters.len() - 1].to_vec());
+        println!("\n[heap] heap-write storm ({secs}s): 8 MiB heap write/call on {} canisters", upd_cans.len());
+        let t = Instant::now();
+        let ws = storm(
+            agent.clone(),
+            upd_cans.clone(),
+            concurrency,
+            phase,
+            Arc::new(|| wasm().push_equal_bytes(0x61, 8 * MIB).reply().build()),
+            false,
+        )
+        .await;
+        ws.report("HEAP-WRITE (8 MiB/call)", t.elapsed());
+
+        // ---- Populate a persistent heap global, then read it ----
+        // 96 MiB so each get_global_data read pulls ~96 MiB of distinct state
+        // into memory (no per-execution accessed cap on heap, unlike stable's
+        // 32 MiB). E.g. 3 canisters x 96 MiB = ~288 MiB distinct read working set.
+        const BIG_MIB: u32 = 96;
+        println!("\n[heap] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let r = update(
+                &agent,
+                c,
+                wasm()
+                    .push_equal_bytes(0x41 + i as u32, BIG_MIB * MIB)
+                    .set_global_data_from_stack()
+                    .reply()
+                    .build(),
+            )
+            .await;
+            println!(" canister {i} = {c}: {}", outcome(&r, "populated", "ERR", 160));
+        }
+        println!("[heap] waiting ~25s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // ---- Heap-read storm (analogue of the stable READ test) ----
+        // get_global_data reads the whole BIG_MIB (96 MiB) global in one
+        // execution — more than the 32 MiB stable per-message accessed limit
+        // would ever allow.
+        let qry_cans = Arc::new(vec![canisters[canisters.len() - 1]]);
+        println!(
+            "\n[heap] heap-read storm ({secs}s): read {BIG_MIB} MiB heap global/call — UPDATES on {} canisters, QUERIES on 1",
+            upd_cans.len()
+        );
+        let t = Instant::now();
+        let (us, qs) = tokio::join!(
+            storm(
+                agent.clone(),
+                upd_cans.clone(),
+                concurrency,
+                phase,
+                Arc::new(|| wasm().get_global_data().reply().build()),
+                false
+            ),
+            storm(
+                agent.clone(),
+                qry_cans.clone(),
+                concurrency,
+                phase,
+                Arc::new(|| wasm().get_global_data().reply().build()),
+                true
+            ),
+        );
+        // Labels follow BIG_MIB so they cannot drift from the populated size.
+        us.report(&format!("HEAP-READ-UPDATE ({BIG_MIB} MiB heap read)"), t.elapsed());
+        qs.report(&format!("HEAP-READ-QUERY ({BIG_MIB} MiB heap read)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if !probe_only {
+        // ---- Phase A: ingress/throughput storm (near-empty updates) ----
+        println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            phase,
+            Arc::new(|| wasm().reply().build()),
+            false,
+        )
+        .await;
+        stats.report("THROUGHPUT (empty updates)", t.elapsed());
+
+        // ---- Phase B: compute storm (8 MiB stable fill per call, within dirty limit) ----
+        println!("\n[3/5] COMPUTE storm: 8 MiB stable_fill per call, {concurrency} concurrent, {secs}s");
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            phase,
+            Arc::new(|| wasm().stable_fill(0, 0x61, 8 * MIB).reply().build()),
+            false,
+        )
+        .await;
+        stats.report("COMPUTE (8 MiB fill)", t.elapsed());
+    }
+
+    // ---- Phase C: per-message dirty-page limit (32 MiB) ----
+    // Grow in a separate (committed) message first, then fill in-bounds amounts
+    // so we isolate the *dirty-page* limit from any grow/bounds effects.
+    println!("\n[4/5] DIRTY-LIMIT probe (per-message stable dirty limit = 32 MiB)");
+    let c = canisters[0];
+    let g = update(&agent, &c, wasm().stable_grow(1024).reply().build()).await; // +64 MiB, commit
+    println!(" grow +64 MiB (own message): {}", outcome(&g, "OK", "ERR", 200));
+    let small = update(&agent, &c, wasm().stable_fill(0, 0x62, 24 * MIB).reply().build()).await;
+    println!(" fill 24 MiB (UNDER 32 MiB limit): {}", outcome(&small, "OK", "ERR", 260));
+    let big = update(&agent, &c, wasm().stable_fill(0, 0x62, 48 * MIB).reply().build()).await;
+    println!(
+        " fill 48 MiB (OVER 32 MiB limit): {}",
+        outcome(&big, "OK — NO LIMIT ENFORCED", "TRAP", 320)
+    );
+
+    if !probe_only {
+        // ---- Phase D: grow stable memory toward the 512 MiB subnet cap ----
+        println!("\n[5/5] MEMORY-GROWTH storm: grow 16 MiB + fill per call across all canisters until rejected");
+        let grow = Arc::new(Stats::default());
+        let total_mib = Arc::new(AtomicU64::new(0));
+        // Measure the real wall time of the phase so the reported throughput
+        // is meaningful.
+        let t = Instant::now();
+        let mut handles = Vec::with_capacity(canisters.len());
+        for &c in canisters.iter() {
+            let agent = agent.clone();
+            let grow = grow.clone();
+            let total_mib = total_mib.clone();
+            handles.push(tokio::spawn(async move {
+                // Hard cap iterations so a misbehaving run can't loop forever.
+                for _ in 0..64 {
+                    // 256 pages x 64 KiB = 16 MiB grown per call; the fill
+                    // always targets offset 0.
+                    let p = wasm().stable_grow(256).stable_fill(0, 0x63, 16 * MIB).reply().build();
+                    let started = Instant::now();
+                    let res = update(&agent, &c, p).await;
+                    let ok = res.is_ok();
+                    grow.record(started, &res);
+                    if ok {
+                        total_mib.fetch_add(16, Ordering::Relaxed);
+                    } else {
+                        break; // first rejection for this canister: stop growing it
+                    }
+                }
+            }));
+        }
+        for h in handles {
+            let _ = h.await;
+        }
+        grow.report("MEMORY-GROWTH", t.elapsed());
+        println!(
+            " approx stable memory successfully grown across subnet: ~{} MiB",
+            total_mib.load(Ordering::Relaxed)
+        );
+    }
+
+    println!("\n== done ==");
+}
diff --git a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs index 420a629a14ac..0f88fc359c52 100644 --- a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs +++ b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs @@ -72,17 +72,27 @@ const SANDBOX_PROCESS_UPDATE_INTERVAL: Duration = Duration::from_secs(10); /// distributed across 4 execution cores. const SANDBOX_PROCESSES_TO_EVICT: usize = 200; -/// The RSS to evict in one go in order to amortize for the eviction cost (1 GiB). -const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(1024 * 1024 * 1024); +/// The RSS to evict in one go in order to amortize for the eviction cost. +/// Nano-replica profile: small batches (64 MiB) since the whole sandbox RSS +/// budget is only ~128 MiB. +const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(64 * 1024 * 1024); /// By default, assume each sandbox process consumes 5 MiB of RSS. /// The actual memory usage is updated asynchronously. 
/// See `monitor_and_evict_sandbox_processes` const DEFAULT_SANDBOX_PROCESS_RSS: NumBytes = NumBytes::new(5 * 1024 * 1024); -/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`. +/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`, +/// but never below `MIN_SANDBOXES_RSS`. const MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO: u64 = 3; +/// Floor on the maximum total sandbox RSS, independent of the heap delta +/// capacity. On the nano-replica profile the heap delta capacity is tiny +/// (tens of MiB), and `heap_delta_capacity / 3` would otherwise starve the +/// sandboxes and cause constant respawning. Keeping a small set of canisters +/// warm matters more than a large heap delta buffer when checkpoints are cheap. +const MIN_SANDBOXES_RSS: NumBytes = NumBytes::new(128 * 1024 * 1024); + /// To speedup synchronous operations, the sandbox RSS-based eviction /// is triggered only when the system's available memory falls below /// the specified byte threshold. @@ -1509,7 +1519,7 @@ impl SandboxedExecutionController { .maximum_state_delta .and_then(|d| if d.get() != 0 { Some(d) } else { None }) .unwrap_or(self.default_subnet_heap_delta_capacity); - heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO + (heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO).max(MIN_SANDBOXES_RSS) } fn trigger_sandbox_eviction( diff --git a/rs/config/src/embedders.rs b/rs/config/src/embedders.rs index 167c0bfabbdb..347b18780584 100644 --- a/rs/config/src/embedders.rs +++ b/rs/config/src/embedders.rs @@ -49,7 +49,7 @@ pub(crate) const MAX_NUMBER_EXPORTED_FUNCTIONS: usize = 1000; pub(crate) const MAX_SUM_EXPORTED_FUNCTION_NAME_LENGTHS: usize = 20000; /// The number of threads to use for query execution per canister. /// See also `QUERY_EXECUTION_THREADS_TOTAL`. 
-pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 2; +pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 1; /// In terms of execution time, compiling 1 WASM instructions takes as much time /// as actually executing 6_000 instructions. Only public for use in tests. @@ -63,25 +63,30 @@ pub const DEFAULT_CREATE_EXECUTION_STATE_BASE_COST: NumInstructions = NumInstructions::new(20_000_000); /// The number of rayon threads used by wasmtime to compile wasm binaries -const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 10; +/// Nano-replica profile: minimal parallelism. +const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 2; /// The number of rayon threads use for the parallel page copying optimization. -const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 8; +/// Nano-replica profile: minimal parallelism. +const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 2; /// Sandbox process eviction ensures that the number of sandbox processes is -/// always below this threshold. Idle sandboxes should be using at most ~5MiB -/// resident memory with the on-disk compilation cache, so 10,000 sandboxes -/// shouldn't be more than 50 GiB. -pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 10_000; +/// always below this threshold. Nano-replica profile: at ~5MiB idle RSS each, +/// 32 sandboxes total ~160 MiB (NOTE(review): this exceeds the ~128 MiB sandbox +/// RSS budget) while keeping a small working set of canisters warm (sandbox +/// respawn is expensive and serialized behind the single update thread). +pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 32; /// A sandbox process may be evicted after it has been idle for this /// duration and sandbox process eviction is activated. -pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(30 * 60); +/// Nano-replica profile: evict idle sandboxes quickly.
+pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(2 * 60); /// The maximum number of pages that a message dirties without optimizing dirty /// page copying by triggering a new execution slice for copying pages. -/// This default is 1 GiB. -pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize = (GIB as usize) / PAGE_SIZE; +/// Nano-replica profile: 32 MiB. +pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize = + (32 * 1024 * 1024) / PAGE_SIZE; /// Scheduling overhead for copying dirty pages, in instructions. pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::new(3_000); @@ -90,31 +95,37 @@ pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::ne pub const WASM64_DIRTY_PAGE_OVERHEAD_MULTIPLIER: u64 = 4; const KIB: u64 = 1024; -const GIB: u64 = KIB * KIB * KIB; +const MIB: u64 = KIB * KIB; + +// Nano-replica profile: these limits bound the *resident* working set of a +// single execution. On a 512 MiB - 1 GiB VM they MUST stay well below the +// available RAM, otherwise a single message can OOM-kill the replica (which on +// a replicated subnet means state divergence). The mainnet values were 1-8 GiB. // Maximum number of stable memory dirty OS pages (4KiB) that an upgrade/install message execution // is allowed to produce. const STABLE_MEMORY_DIRTY_PAGE_LIMIT_UPGRADE: NumOsPages = - NumOsPages::new(8 * GIB / (PAGE_SIZE as u64)); + NumOsPages::new(128 * MIB / (PAGE_SIZE as u64)); // Maximum number of stable memory dirty OS pages (4KiB) that a regular message (update) execution // is allowed to produce. const STABLE_MEMORY_DIRTY_PAGE_LIMIT_MESSAGE: NumOsPages = - NumOsPages::new(2 * GIB / (PAGE_SIZE as u64)); + NumOsPages::new(32 * MIB / (PAGE_SIZE as u64)); // Maximum number of stable memory dirty OS pages (4KiB) that a non-replicated query is allowed to produce. 
-const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages = NumOsPages::new(GIB / (PAGE_SIZE as u64)); +const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages = + NumOsPages::new(32 * MIB / (PAGE_SIZE as u64)); // Maximum number of stable memory OS pages (4KiB) that that an upgrade/install message execution // is allowed to access. const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_UPGRADE: NumOsPages = - NumOsPages::new(8 * GIB / (PAGE_SIZE as u64)); + NumOsPages::new(128 * MIB / (PAGE_SIZE as u64)); // Maximum number of stable memory OS pages (4KiB) that a that a regular message (update) execution // is allowed to access. const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_MESSAGE: NumOsPages = - NumOsPages::new(2 * GIB / (PAGE_SIZE as u64)); + NumOsPages::new(32 * MIB / (PAGE_SIZE as u64)); // Maximum number of stable memory OS pages (4KiB) that a single non-replicated query execution // is allowed to access. const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_QUERY: NumOsPages = - NumOsPages::new(GIB / (PAGE_SIZE as u64)); + NumOsPages::new(32 * MIB / (PAGE_SIZE as u64)); /// The maximum size in bytes for an uncompressed Wasm module. This value is /// also used as the maximum size for the Wasm chunk store of each canister. diff --git a/rs/config/src/execution_environment.rs b/rs/config/src/execution_environment.rs index 75a72fc8aacc..4a7ce32c2f6a 100644 --- a/rs/config/src/execution_environment.rs +++ b/rs/config/src/execution_environment.rs @@ -9,7 +9,6 @@ use std::{str::FromStr, time::Duration}; const KIB: u64 = 1024; const MIB: u64 = 1024 * KIB; const GIB: u64 = 1024 * MIB; -const TIB: u64 = 1024 * GIB; const REPLICATED_INTER_CANISTER_LOG_FETCH_FEATURE: FlagStatus = FlagStatus::Disabled; @@ -34,7 +33,12 @@ pub const TEST_DEFAULT_LOG_MEMORY_USAGE: u64 = if LOG_MEMORY_STORE_FEATURE_ENABL /// This specifies the threshold in bytes at which the subnet memory usage is /// considered to be high. 
If this value is greater or equal to the subnet /// capacity, then the subnet is never considered to have high usage. -const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB); +// Nano-replica profile: set equal to the subnet memory capacity so the subnet +// is never considered "high usage" and the storage cycle-reservation mechanism +// stays disabled — canisters can allocate freely up to the subnet capacity +// without reserving cycles (reservation pricing is calibrated for mainnet and +// would otherwise reject growth on a tiny subnet). +const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(512 * MIB); /// This is the upper limit on how much logical storage canisters can request to /// be store on a given subnet. @@ -42,7 +46,8 @@ const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB); /// Logical storage is the amount of storage being used from the point of view /// of the canister. The actual storage used by the nodes can be higher as the /// IC protocol requires storing copies of the canister state. -const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB); +// Nano-replica profile: a few hundred MB of logical storage for the whole subnet. +const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(512 * MIB); /// This is the upper limit on how much memory can be used by all guaranteed /// response canister messages on a given subnet. @@ -50,24 +55,27 @@ const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB); /// Guaranteed response message memory usage is calculated as the total size of /// enqueued guaranteed responses; plus the maximum allowed response size per /// reserved guaranteed response slot. -const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(15 * GIB); +// Nano-replica profile: guaranteed-response messages are heavily reserved +// (~2 MiB per outstanding call), so keep this small. Consider rejecting +// guaranteed-response calls entirely (best-effort-only subnet). 
+const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(64 * MIB); /// The limit on how much memory may be used by all guaranteed response messages /// on a given subnet at the end of a round. /// /// During the round, the best-effort message memory usage may exceed the limit, /// but the constraint is restored at the end of the round by shedding messages. -const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(5 * GIB); +const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB); /// This is the upper limit on how much memory can be used by the ingress /// history on a given subnet. It is lower than the subnet message memory /// capacity because here we count actual memory consumption as opposed to /// memory plus reservations. -const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(4 * GIB); +const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB); /// This is the upper limit on how much memory can be used by wasm custom /// sections on a given subnet. -const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * GIB); +const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(16 * MIB); // The gen 1 production machines should have 64 cores. // We could in theory use 32 threads, leaving other threads for query handling, @@ -79,15 +87,26 @@ const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * // We needs to ensure: // `SUBNET_MEMORY_CAPACITY / number_of_threads >= max_canister_memory` // If you change this number please adjust other constants as well. -pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 4; +// Nano-replica profile: minimum viable update-execution parallelism. This also +// sets `SchedulerConfig::scheduler_cores` and divides the (small) subnet memory +// capacity across threads. 
+// +// NOTE: the DTS scheduler requires at least 2 cores — compute capacity is +// `(scheduler_cores - 1) * 100%` (see `round_schedule::compute_capacity_percent`), +// so a single core yields 0% allocatable capacity and trips a scheduler +// invariant on every round. 2 is the floor. +pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 2; /// The number of bytes reserved for response callback executions. -/// For each thread, we reserve 2.5GiB of memory or, equivalently, 2560MiB. +/// Nano-replica profile: keep this small (8 MiB per thread) so canisters can +/// allocate almost the entire subnet memory capacity. The reservation only +/// guards response-callback execution headroom; on a best-effort-leaning nano +/// subnet a small reservation is sufficient. pub const SUBNET_MEMORY_RESERVATION: NumBytes = - NumBytes::new(2560 * MIB * NUMBER_OF_EXECUTION_THREADS as u64); + NumBytes::new(8 * MIB * NUMBER_OF_EXECUTION_THREADS as u64); /// The soft limit on the subnet-wide number of callbacks. -pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 1_000_000; +pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 4_096; /// The number of callbacks that are guaranteed to each canister. pub const CANISTER_GUARANTEED_CALLBACK_QUOTA: usize = 50; @@ -108,7 +127,11 @@ pub const STOP_CANISTER_TIMEOUT_DURATION: Duration = Duration::from_secs(5 * 60) /// potential fragmentation. This limit should be larger than the maximum /// canister memory size to guarantee that a message that overwrites the whole /// memory can succeed. -pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(140 * GIB); +// Nano-replica profile: heap deltas are the dominant *resident* cost between +// checkpoints. Keep this small and checkpoint frequently (the subnet state is +// only a few hundred MB, so checkpoints are cheap). Must be >= the per-message +// dirty page limit so a single message can still complete. 
+pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(96 * MIB); /// The maximum number of instructions for inspect_message calls. const MAX_INSTRUCTIONS_FOR_MESSAGE_ACCEPTANCE_CALLS: NumInstructions = @@ -126,7 +149,8 @@ pub const INSTRUCTION_OVERHEAD_PER_QUERY_CALL: u64 = 50_000_000; /// The number of query execution threads overall for all canisters. /// See also `QUERY_EXECUTION_THREADS_PER_CANISTER`. -pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 4; +// Nano-replica profile: a single query-execution thread. +pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 1; /// When a canister is scheduled for query execution, it is allowed to run for /// this amount of time. This limit controls how many queries the canister @@ -147,7 +171,7 @@ const QUERY_SCHEDULING_TIME_SLICE_PER_CANISTER: Duration = Duration::from_millis /// /// The limit includes both cache keys and values, for successful query /// executions and user errors. -const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(200 * MIB); +const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(16 * MIB); /// The upper limit on how long the cache entry stays valid in the query cache. const QUERY_CACHE_MAX_EXPIRY_TIME: Duration = Duration::from_secs(600); @@ -187,7 +211,9 @@ pub const DOGECOIN_MAINNET_CANISTER_ID: &str = "gordg-fyaaa-aaaan-aaadq-cai"; const DOGECOIN_MAINNET_STAGING_CANISTER_ID: &str = "bhuiy-ciaaa-aaaad-abwea-cai"; /// The capacity of the Wasm compilation cache. -pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(10 * GIB); +// Nano-replica profile: the compilation cache is on-disk; keep the in-memory +// bound small. +pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(64 * MIB); /// Maximum number of controllers allowed in a request (specified in the interface spec). 
pub const MAX_ALLOWED_CONTROLLERS_COUNT: usize = 10; diff --git a/rs/config/src/message_routing.rs b/rs/config/src/message_routing.rs index b1f7c064c560..355912020beb 100644 --- a/rs/config/src/message_routing.rs +++ b/rs/config/src/message_routing.rs @@ -4,13 +4,14 @@ use serde::{Deserialize, Serialize}; /// /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`. -pub const TARGET_STREAM_SIZE_BYTES: usize = 10 * 1024 * 1024; +// Nano-replica profile: smaller XNet streams to bound per-stream memory. +pub const TARGET_STREAM_SIZE_BYTES: usize = 2 * 1024 * 1024; /// Maximum number of messages in a stream. /// /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`. -pub const MAX_STREAM_MESSAGES: usize = 10_000; +pub const MAX_STREAM_MESSAGES: usize = 1_000; #[derive(Clone, Eq, PartialEq, Debug, Deserialize, Serialize)] #[serde(default)] diff --git a/rs/config/src/subnet_config.rs b/rs/config/src/subnet_config.rs index 14d51e72f421..69189a446556 100644 --- a/rs/config/src/subnet_config.rs +++ b/rs/config/src/subnet_config.rs @@ -26,7 +26,7 @@ impl SubnetSecurity { } } -const GIB: u64 = 1024 * 1024 * 1024; +const MIB: u64 = 1024 * 1024; const M: u64 = 1_000_000; const B: u64 = 1_000_000_000; const T: u128 = 1_000_000_000_000; @@ -103,12 +103,17 @@ const SYSTEM_SUBNET_FACTOR: u64 = 10; // so a round may take 1 to 4 seconds. To avoid regressing the throughput of // slow subnets while maintaining the speed of fast subnets, we use the middle // value of 200MB. -const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(200 * M); +// Nano-replica profile: keep a single round's heap-delta production below the +// SUBNET_HEAP_DELTA_CAPACITY (96 MiB) so one round cannot overshoot the cap and +// spike unreclaimable (anonymous) resident memory. 
This bounds the per-round +// dirty working set so writes stay safe on a 512 MiB - 1 GiB VM. +const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(64 * M); /// The reserve represents the freely available portion of the /// `subnet_heap_delta_capacity` that can be used as a heap delta burst /// during the initial rounds following a checkpoint. -const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * GIB); +/// Nano-replica profile: must not exceed `SUBNET_HEAP_DELTA_CAPACITY`. +const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * MIB); // Log all messages that took more than this value to execute. pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0; @@ -122,7 +127,9 @@ pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0; /// /// long installs + long updates + query threads = 1 + 4 + 2 = 7 /// -const MAX_PAUSED_EXECUTIONS: usize = 4; +// Nano-replica profile: limit concurrent paused (DTS) executions to keep the +// number of simultaneously live Wasm instances small. +const MAX_PAUSED_EXECUTIONS: usize = 1; /// Cost for creating a new canister. pub const CANISTER_CREATION_FEE: Cycles = Cycles::new(500_000_000_000); @@ -318,7 +325,10 @@ impl SchedulerConfig { max_heap_delta_per_iteration: MAX_HEAP_DELTA_PER_ITERATION, max_message_duration_before_warn_in_seconds: MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS, - heap_delta_rate_limit: NumBytes::from(75 * 1024 * 1024), + // Nano-replica profile: cap per-canister heap delta per round so a + // single canister cannot fill the (small) subnet heap delta capacity + // in one round. + heap_delta_rate_limit: NumBytes::from(32 * MIB), install_code_rate_limit: MAX_INSTRUCTIONS_PER_SLICE, dirty_page_overhead: DEFAULT_DIRTY_PAGE_OVERHEAD, accumulated_priority_reset_interval: ACCUMULATED_PRIORITY_RESET_INTERVAL,