diff --git a/Cargo.lock b/Cargo.lock
index 16bcb0f104fd..b26b2d52cb28 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7258,6 +7258,7 @@ dependencies = [
  "ic-secp256k1",
  "ic-test-utilities-types",
  "ic-types",
+ "ic-universal-canister",
  "ic-validator",
  "itertools 0.12.1",
  "prost 0.13.5",
diff --git a/dev/local-net/prep.sh b/dev/local-net/prep.sh
index 003f8af6e7b4..6aa3f71dd785 100755
--- a/dev/local-net/prep.sh
+++ b/dev/local-net/prep.sh
@@ -97,6 +97,7 @@ docker run --rm \
     --nns-subnet-index "$SUBNET_IDX" \
     --provisional-whitelist /bootstrap/.provisional_whitelist.json \
     --use-specified-ids-allocation-range \
+    --dkg-interval-length 49 \
     "${NODE_ARGS[@]}"
 
 # Permissions: ic-prep wrote as root inside the container. Make the
@@ -139,6 +140,13 @@ for i in 0 1 2 3; do
   crypto: {
     crypto_root: "/etc/ic/crypto",
   },
+  hypervisor: {
+    // Nano profile: threshold == subnet memory capacity disables the storage
+    // cycle-reservation mechanism; small reservation lets canisters allocate
+    // up to ~the full subnet memory capacity.
+    subnet_memory_threshold: 536870912,
+    subnet_memory_reservation: 16777216,
+  },
   http_handler: {
     listen_addr: "[::]:$HTTP_PORT",
   },
diff --git a/rs/canister_client/Cargo.toml b/rs/canister_client/Cargo.toml
index 1dfd761ad595..905df0adade2 100644
--- a/rs/canister_client/Cargo.toml
+++ b/rs/canister_client/Cargo.toml
@@ -31,6 +31,7 @@ url = { workspace = true }
 [dev-dependencies]
 hex = { workspace = true }
 ic-certification-test-utils = { path = "../certification/test-utils" }
+ic-universal-canister = { path = "../universal_canister/lib" }
 ic-crypto-test-utils-reproducible-rng = { path = "../crypto/test_utils/reproducible_rng" }
 ic-crypto-test-utils-root-of-trust = { path = "../crypto/test_utils/root_of_trust" }
 ic-crypto-test-utils-tls = { path = "../crypto/test_utils/tls" }
diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
new file mode 100644
index 000000000000..3dfda207a5b5
--- /dev/null
+++ b/rs/canister_client/examples/hammer.rs
@@ -0,0 +1,701 @@
+//! Stress-test driver for the local 4-node subnet (dev/local-net).
+//!
+//! Deploys N universal canisters via `provisional_create_canister_with_cycles`
+//! and then hammers the subnet with compute and memory load, reporting how it
+//! holds up. Drives the public endpoint with the in-repo `ic-canister-client`
+//! Agent, so it needs no dfx / external SDK.
+//!
+//! Run with:
+//!   cargo run -p ic-canister-client --example hammer --release -- http://localhost:8080
+//!
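+//! Env knobs (defaults in parentheses): HAMMER_CANISTERS (6), HAMMER_SECS (per storm phase, 15),
+//! HAMMER_CONCURRENCY (48), HAMMER_MODE (probe|read|heap|calls|fanout|hybrid|heapread; unset = default phases), HAMMER_CALL_DEPTH (4), HAMMER_FANOUT_MULT (1).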
+
+use ic_canister_client::{Agent, Sender};
+use ic_management_canister_types_private::{
+    CanisterIdRecord, CanisterInstallMode, IC_00, InstallCodeArgs, Method, Payload,
+    ProvisionalCreateCanisterWithCyclesArgs,
+};
+use ic_types::{CanisterId, PrincipalId};
+use ic_universal_canister::{call_args, get_universal_canister_wasm, wasm};
+use std::collections::BTreeMap;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+use url::Url;
+
+const MIB: u32 = 1024 * 1024; // bytes in one MiB
+// A canister id that lives in this subnet's allocation range (see DEPLOY.md);
+// used only to route `provisional_create_canister_with_cycles`.
+const ROUTING_CANISTER_ID: &str = "bnz7o-iuaaa-aaaaa-qaaaa-cai";
+
+static NONCE: AtomicU64 = AtomicU64::new(1); // counter behind `next_nonce`; first nonce is 1
+
+fn next_nonce() -> Vec<u8> {
+    Vec::from(NONCE.fetch_add(1, Ordering::Relaxed).to_le_bytes()) // 8 little-endian bytes, new value per call
+}
+
+#[derive(Default)]
+struct Stats { // counters for one storm, shared by all of its worker tasks
+    ok: AtomicU64, // calls that returned Ok
+    err: AtomicU64, // calls that returned Err
+    lat_sum_ms: AtomicU64, // sum of per-call latencies in ms (successes and failures alike)
+    lat_max_ms: AtomicU64, // largest single-call latency seen, in ms
+    err_classes: Mutex<BTreeMap<String, u64>>, // shortened error text -> number of occurrences
+}
+
+impl Stats {
+    /// Account for one finished call: add its latency since `started` to the
+    /// running sum/max and bump the ok or err counter; errors are additionally
+    /// bucketed by a shortened message (first 60 words, at most 400 chars).
+    fn record(&self, started: Instant, result: &Result<Option<Vec<u8>>, String>) {
+        let elapsed_ms = started.elapsed().as_millis() as u64;
+        self.lat_sum_ms.fetch_add(elapsed_ms, Ordering::Relaxed);
+        self.lat_max_ms.fetch_max(elapsed_ms, Ordering::Relaxed);
+        let Err(message) = result else {
+            self.ok.fetch_add(1, Ordering::Relaxed);
+            return;
+        };
+        self.err.fetch_add(1, Ordering::Relaxed);
+        let words: Vec<&str> = message.split_whitespace().take(60).collect();
+        let bucket: String = words.join(" ").chars().take(400).collect();
+        let mut classes = self.err_classes.lock().unwrap();
+        *classes.entry(bucket).or_insert(0) += 1;
+    }
+
+    /// Print a summary for `label`: ok/err counts, successful calls per second
+    /// over `wall`, average and maximum latency, then the error histogram.
+    fn report(&self, label: &str, wall: Duration) {
+        let (ok, err) = (self.ok.load(Ordering::Relaxed), self.err.load(Ordering::Relaxed));
+        let avg = match ok + err {
+            0 => 0,
+            total => self.lat_sum_ms.load(Ordering::Relaxed) / total,
+        };
+        let wall_secs = wall.as_secs_f64();
+        let rps = ok as f64 / wall_secs.max(0.001);
+        println!(
+            "\n── {label} ──\n  ok={ok} err={err}  throughput={rps:.1} ok/s  \
+             latency avg={avg}ms max={}ms  (wall {:.1}s)",
+            self.lat_max_ms.load(Ordering::Relaxed), wall_secs
+        );
+        let classes = self.err_classes.lock().unwrap();
+        if classes.is_empty() {
+            return;
+        }
+        println!("  error classes:");
+        for (c, n) in classes.iter() {
+            println!("    [{n:>5}] {c}");
+        }
+    }
+}
+
+/// Calls the `update` method of `canister` with `payload` through `agent`,
+/// tagging the request with a fresh nonce from `next_nonce`.
+/// Returns whatever `Agent::execute_update` yields: optional reply bytes on
+/// success, or an error string.
+async fn update(agent: &Agent, canister: &CanisterId, payload: Vec<u8>) -> Result<Option<Vec<u8>>, String> {
+    let nonce = next_nonce();
+    let call = agent.execute_update(canister, canister, "update", payload, nonce);
+    call.await
+}
+
+/// Build the payload for a call chain `depth` hops long around the canister
+/// ring. The canister that receives it calls canisters[start+1], which calls
+/// canisters[start+2], and so on (indices wrap modulo the pool size); the last
+/// callee simply replies and the replies unwind back to the ingress target.
+/// That is ~2*depth inter-canister messages per ingress (depth requests +
+/// depth responses) and `depth` callbacks outstanding at the peak
+/// (each holding a guaranteed-response memory reservation).
+fn chain_payload(canisters: &[CanisterId], start: usize, depth: usize) -> Vec<u8> {
+    let ring_len = canisters.len();
+    // Assemble inside-out: start from the innermost reply, wrap one call per hop.
+    (1..=depth).rev().fold(wasm().reply().build(), |inner, hop| {
+        let callee = canisters[(start + hop) % ring_len].get().as_slice().to_vec();
+        wasm()
+            .call_simple(callee, "update", call_args().other_side(inner))
+            .build()
+    })
+}
+
+/// Build a payload that makes the ingress-target canister issue `mult` update
+/// calls to every canister in the pool from a single message (fan-out,
+/// fire-and-forget: on_reply/on_reject are no-ops) and then reply to the
+/// ingress immediately. Each in-flight ingress thus leaves
+/// N = canisters.len() * mult inter-canister calls outstanding, each holding a
+/// guaranteed-response memory reservation (~2 MiB) and a callback slot — so C
+/// concurrent ingresses drive up to C*N simultaneous outstanding calls,
+/// stressing the 64 MiB guaranteed-response cap and the callback limits.
+fn fanout_payload(canisters: &[CanisterId], mult: usize) -> Vec<u8> {
+    let reply_only = wasm().reply().build(); // what each callee executes
+    let ignore = wasm().noop().build(); // response handlers do nothing
+    // All calls are issued in ONE message: every response-memory reservation is
+    // taken before any callee runs, so a single message issuing > ~32 calls
+    // exceeds the 64 MiB guaranteed-response cap.
+    let with_calls = (0..mult)
+        .flat_map(|_| canisters.iter())
+        .fold(wasm(), |builder, target| {
+            let handlers = call_args().on_reply(ignore.clone()).on_reject(ignore.clone());
+            builder
+                .call_new(target.get().as_slice().to_vec(), "update", handlers)
+                .call_data_append(&reply_only)
+                .call_perform()
+        });
+    with_calls.reply().build()
+}
+
+/// Create a canister with 1 Pcycle via `provisional_create_canister_with_cycles`
+/// (routed through `routing_id`), install the universal canister Wasm on it
+/// and, when `pre_grow_pages > 0`, grow its stable memory by that many pages.
+/// Returns the new canister id, or the first error encountered.
+async fn deploy_one(
+    agent: &Agent,
+    routing_id: &CanisterId,
+    pre_grow_pages: u32,
+) -> Result<CanisterId, String> {
+    let create_args = ProvisionalCreateCanisterWithCyclesArgs::new(
+        Some(1_000_000_000_000_000_u128), // 1 Pcycle, never freezes
+        None,
+    );
+    let create = Method::ProvisionalCreateCanisterWithCycles;
+    let raw = agent
+        .execute_update(routing_id, &IC_00, create, create_args.encode(), next_nonce())
+        .await?
+        .ok_or("provisional_create: empty reply")?;
+    let record =
+        CanisterIdRecord::decode(&raw).map_err(|e| format!("decode CanisterIdRecord: {e}"))?;
+    let canister_id = record.get_canister_id();
+
+    // Fresh install of the universal canister Wasm on the new canister.
+    let install = InstallCodeArgs::new(
+        CanisterInstallMode::Install,
+        canister_id,
+        get_universal_canister_wasm(),
+        vec![],
+    );
+    agent.install_canister(install).await?;
+
+    if pre_grow_pages == 0 {
+        return Ok(canister_id);
+    }
+    let grow = wasm().stable_grow(pre_grow_pages).reply().build();
+    update(agent, &canister_id, grow).await?;
+    Ok(canister_id)
+}
+
+/// Run `make_payload` against the canister pool from `concurrency` workers until
+/// `dur` elapses, as queries when `is_query` and as updates otherwise. An empty
+/// pool returns empty stats at once (workers would otherwise panic on `% 0`).
+async fn storm(
+    agent: Arc<Agent>,
+    canisters: Arc<Vec<CanisterId>>,
+    concurrency: usize,
+    dur: Duration,
+    make_payload: Arc<dyn Fn() -> Vec<u8> + Send + Sync>,
+    is_query: bool,
+) -> Stats {
+    if canisters.is_empty() {
+        return Stats::default();
+    }
+    let stats = Arc::new(Stats::default());
+    let deadline = Instant::now() + dur;
+    let rr = Arc::new(AtomicU64::new(0)); // round-robin cursor shared by all workers
+    let mut handles = Vec::new();
+    for _ in 0..concurrency {
+        let (agent, canisters, stats) = (agent.clone(), canisters.clone(), stats.clone());
+        let (rr, make_payload) = (rr.clone(), make_payload.clone());
+        handles.push(tokio::spawn(async move {
+            while Instant::now() < deadline {
+                let canister = canisters[rr.fetch_add(1, Ordering::Relaxed) as usize % canisters.len()];
+                let payload = make_payload();
+                let started = Instant::now();
+                let res = if is_query {
+                    agent.execute_query(&canister, "query", payload).await
+                } else {
+                    update(&agent, &canister, payload).await
+                };
+                stats.record(started, &res);
+            }
+        }));
+    }
+    for h in handles {
+        let _ = h.await;
+    }
+    Arc::try_unwrap(stats).unwrap_or_default()
+}
+
+/// Parse environment variable `key` as a `T`, falling back to `default` when
+/// it is unset or does not parse.
+fn env_or<T: FromStr>(key: &str, default: T) -> T {
+    std::env::var(key).ok().and_then(|s| s.parse().ok()).unwrap_or(default)
+}
+
+/// Entry point: deploys the canister pool, then runs the scenario selected by
+/// `HAMMER_MODE` (`hybrid`, `calls`, `fanout`, `read`, `heapread`, `heap` or
+/// `probe`); with no recognised mode it runs the default phases A-D in order.
+///
+/// The first CLI argument is the replica URL (default `http://localhost:8080`).
+/// Exits with status 1 when no canister could be deployed.
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    let url = std::env::args().nth(1).unwrap_or_else(|| "http://localhost:8080".to_string());
+    let num_canisters: usize = env_or("HAMMER_CANISTERS", 6);
+    let secs: u64 = env_or("HAMMER_SECS", 15);
+    let concurrency: usize = env_or("HAMMER_CONCURRENCY", 48);
+    // Read HAMMER_MODE once; an unset or non-UTF-8 value matches no mode below.
+    let mode = std::env::var("HAMMER_MODE").unwrap_or_default();
+    // probe mode: skip the throughput/compute/growth storms, run only the
+    // per-message dirty-page-limit probe (Phase C).
+    let probe_only = mode == "probe";
+    // read mode: populate canisters with large state, then run read-heavy
+    // queries across all of them; plus a read-limit probe (update and query).
+    let read_mode = mode == "read";
+    // heap mode: the heap-memory (Wasm) analogue of the stable-memory tests
+    // (compute/dirty-limit/read). Heap has no per-execution dirty/accessed cap
+    // (the 32 MiB limits are stable-only), so a single message can touch
+    // arbitrarily large heap.
+    let heap_mode = mode == "heap";
+    // calls mode: thrash inter-canister communication — each ingress triggers a
+    // chain of canister-to-canister update calls `HAMMER_CALL_DEPTH` hops deep.
+    let calls_mode = mode == "calls";
+    let call_depth: usize = env_or("HAMMER_CALL_DEPTH", 4);
+    // fanout mode: each ingress fires N parallel calls (fire-and-forget) →
+    // N outstanding calls per in-flight ingress, to stress the guaranteed-
+    // response memory reservation and callback limits.
+    let fanout_mode = mode == "fanout";
+    // hybrid mode: reads + writes + inter-canister messaging all at once.
+    let hybrid_mode = mode == "hybrid";
+    // heapread mode: large heap-memory reads pulling lots of distinct state into
+    // RAM. Each canister holds a 96 MiB heap global (built via append, small
+    // transient); reads use queries (heap reads via update would OOM because
+    // get_global_data copies the global to the stack, permanently growing heap).
+    let heapread_mode = mode == "heapread";
+
+    let agent = Arc::new(Agent::new(
+        Url::parse(&url).expect("bad url"),
+        Sender::Anonymous,
+    ));
+    let routing_id =
+        CanisterId::unchecked_from_principal(PrincipalId::from_str(ROUTING_CANISTER_ID).unwrap());
+
+    println!("== hammer ==");
+    println!("target={url} canisters={num_canisters} phase_secs={secs} concurrency={concurrency}");
+
+    // ---- Deploy ----
+    println!("\n[1/5] deploying {num_canisters} universal canisters (pre-grow 32 MiB stable each)...");
+    let t0 = Instant::now();
+    let mut canisters = Vec::new();
+    for i in 0..num_canisters {
+        match deploy_one(&agent, &routing_id, 512).await {
+            Ok(id) => {
+                println!("  + canister {i} = {id}");
+                canisters.push(id);
+            }
+            Err(e) => println!("  ! deploy {i} failed: {e}"),
+        }
+    }
+    if canisters.is_empty() {
+        eprintln!("no canisters deployed; aborting");
+        std::process::exit(1);
+    }
+    println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
+    let canisters = Arc::new(canisters);
+
+    if hybrid_mode {
+        // ---- Hybrid: heavy reads + writes + inter-canister messaging at once ----
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let grow_pages = chunk / 65536;
+        let windows = ((BIG_MIB * MIB) / chunk) as u64;
+        println!("\n[hybrid] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut off = 0u32;
+            while off + chunk <= BIG_MIB * MIB {
+                let _ = update(&agent, c, wasm().stable_grow(grow_pages).stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await;
+                off += chunk;
+            }
+        }
+        println!("[hybrid] waiting ~20s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(20)).await;
+
+        // Three concurrent storms over the full canister pool, splitting the
+        // concurrency budget. Each canister sees a mix of query-reads,
+        // update-writes, and inter-canister call chains simultaneously.
+        let each = (concurrency / 3).max(1);
+        let roff = Arc::new(AtomicU64::new(0));
+        let woff = Arc::new(AtomicU64::new(0));
+        let mctr = Arc::new(AtomicU64::new(0));
+        let read_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let roff = roff.clone();
+            Arc::new(move || {
+                let n = roff.fetch_add(1, Ordering::Relaxed);
+                wasm().stable_read(((n % windows) as u32) * chunk, chunk).reply().build()
+            })
+        };
+        let write_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let woff = woff.clone();
+            Arc::new(move || {
+                let n = woff.fetch_add(1, Ordering::Relaxed);
+                // overwrite 8 MiB within an existing window (dirties, no growth)
+                wasm().stable_fill(((n % windows) as u32) * chunk, 0x77, 8 * MIB).reply().build()
+            })
+        };
+        let msg_cans = canisters.clone();
+        let msg_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let msg_cans = msg_cans.clone();
+            Arc::new(move || {
+                let n = mctr.fetch_add(1, Ordering::Relaxed) as usize;
+                chain_payload(&msg_cans, n % msg_cans.len(), 3)
+            })
+        };
+        println!(
+            "\n[hybrid] storm ({secs}s): reads(query 24 MiB) + writes(update 8 MiB) + messages(3-hop chains), {each} concurrent each over {} canisters",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let (rs, ws, ms) = tokio::join!(
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), read_mk, true),
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), write_mk, false),
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), msg_mk, false),
+        );
+        rs.report("HYBRID reads (query, 24 MiB stable_read)", t.elapsed());
+        ws.report("HYBRID writes (update, 8 MiB stable_fill)", t.elapsed());
+        ms.report("HYBRID messages (3-hop call chains)", t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if calls_mode {
+        // ---- Inter-canister call thrash ----
+        // Each ingress makes the target canister start a `call_depth`-hop chain
+        // of update calls around the canister ring. With C concurrent ingresses
+        // there are up to C*call_depth outstanding inter-canister calls at peak.
+        let cans = canisters.clone();
+        let ctr = Arc::new(AtomicU64::new(0));
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let cans = cans.clone();
+            Arc::new(move || {
+                let n = ctr.fetch_add(1, Ordering::Relaxed) as usize;
+                chain_payload(&cans, n % cans.len(), call_depth)
+            })
+        };
+        println!(
+            "\n[calls] inter-canister call storm ({secs}s): {call_depth}-hop chains, {concurrency} concurrent ingresses across {} canisters",
+            canisters.len()
+        );
+        println!("  (~{} inter-canister messages per ingress; up to {} outstanding calls at peak)", 2 * call_depth, concurrency * call_depth);
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            mk,
+            false,
+        )
+        .await;
+        stats.report(&format!("INTER-CANISTER CALLS ({call_depth}-hop chains)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if fanout_mode {
+        // ---- Inter-canister FAN-OUT thrash (stresses response-memory reservation) ----
+        // HAMMER_FANOUT_MULT (default 1) is how many calls each ingress fires at
+        // every canister, so one ingress leaves `canisters.len() * mult` calls
+        // outstanding.
+        let mult: usize = env_or("HAMMER_FANOUT_MULT", 1);
+        let n = canisters.len() * mult;
+        let cans = canisters.clone();
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let cans = cans.clone();
+            Arc::new(move || fanout_payload(&cans, mult))
+        };
+        println!(
+            "\n[fanout] inter-canister FAN-OUT storm ({secs}s): each ingress fires {n} parallel calls in ONE message (fire-and-forget), {concurrency} concurrent ingresses"
+        );
+        println!(
+            "  ({n} simultaneous reservations/ingress; the 64 MiB guaranteed-response cap allows only ~32 — expect rejections when {n} > ~32)"
+        );
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            mk,
+            false,
+        )
+        .await;
+        stats.report(&format!("INTER-CANISTER FAN-OUT (x{n} parallel/ingress)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if read_mode {
+        // Populate each canister with ~120 MiB of real stable data (written in
+        // <=24 MiB chunks to respect the 32 MiB per-message dirty limit).
+        const BIG_MIB: u32 = 128;
+        let chunk: u32 = 24 * MIB;
+        // Only whole chunks are written: 5 x 24 MiB = 120 MiB of the 128 MiB target.
+        println!("\n[read] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        // Stable memory grows one chunk per message below instead of one big grow.
+        let grow_pages = chunk / 65536; // pages per 24 MiB step
+        for (i, c) in canisters.iter().enumerate() {
+            let (mut off, mut werr) = (0u32, 0u32);
+            // Grow + fill one 24 MiB window at a time: a single 128 MiB grow can
+            // be rejected, but small incremental grows reliably build the state.
+            while off + chunk <= BIG_MIB * MIB {
+                let p = wasm().stable_grow(grow_pages).stable_fill(off, 0x40 + i as u32, chunk).reply().build();
+                if update(&agent, c, p).await.is_err() {
+                    werr += 1;
+                }
+                off += chunk;
+            }
+            println!("  canister {i} = {c} populated ({werr} write errors)");
+        }
+        println!("[read] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read 24 MiB (< 32 MiB accessed limit) per call, cycling the offset
+        // window across the populated range.
+        let off_ctr = Arc::new(AtomicU64::new(0));
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let off_ctr = off_ctr.clone();
+            let windows = ((BIG_MIB * MIB) / chunk) as u64; // cycle across the FULL state
+            Arc::new(move || {
+                let n = off_ctr.fetch_add(1, Ordering::Relaxed);
+                let off = ((n % windows) as u32) * chunk;
+                wasm().stable_read(off, chunk).reply().build()
+            })
+        };
+        let all_cans = Arc::new(canisters.as_ref().clone());
+        println!(
+            "\n[read] read storm ({secs}s): 24 MiB stable_read/call — QUERIES across all {} canisters (cycling full range)",
+            all_cans.len()
+        );
+        let t = Instant::now();
+        let qs = storm(agent.clone(), all_cans, concurrency, Duration::from_secs(secs), mk.clone(), true).await;
+        qs.report("READ-QUERY (24 MiB stable_read, all canisters)", t.elapsed());
+
+        // Read-limit probe: access 48 MiB in one execution (> 32 MiB accessed
+        // limit) -> expect a trap, for both update and query.
+        println!("\n[read] read-limit probe: 48 MiB stable_read in one execution (accessed limit 32 MiB)");
+        let ru = update(&agent, &canisters[0], wasm().stable_read(0, 48 * MIB).reply().build()).await;
+        println!(
+            "  update read 48 MiB: {}",
+            if ru.is_ok() { "OK (no limit!)".to_string() } else { format!("TRAP {}", ru.as_ref().err().unwrap().chars().take(220).collect::<String>()) }
+        );
+        let rq = agent
+            .execute_query(&canisters[canisters.len() - 1], "query", wasm().stable_read(0, 48 * MIB).reply().build())
+            .await;
+        println!(
+            "  query  read 48 MiB: {}",
+            if rq.is_ok() { "OK (no limit!)".to_string() } else { format!("TRAP {}", rq.as_ref().err().unwrap().chars().take(220).collect::<String>()) }
+        );
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if heapread_mode {
+        // Build a large heap global per canister via append (24 MiB chunks, so
+        // the transient heap stays small and all 3 globals fit under the cap).
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let appends = (BIG_MIB * MIB) / chunk;
+        println!("\n[heapread] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut ok = true;
+            for _ in 0..appends {
+                if update(&agent, c, wasm().push_equal_bytes(0x41 + i as u32, chunk).append_to_global_data().reply().build()).await.is_err() {
+                    ok = false;
+                }
+            }
+            println!("  canister {i} = {c}: {}", if ok { "populated" } else { "PARTIAL/FAILED" });
+        }
+        println!("[heapread] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read the full 96 MiB global per call via queries on ALL canisters.
+        // (Heap reads via update OOM: the get_global_data stack copy permanently
+        // grows the heap. Queries discard it.) This pulls ~3x96 MiB of distinct
+        // heap state into the page cache.
+        println!(
+            "\n[heapread] heap-read QUERY storm ({secs}s): get_global_data ({BIG_MIB} MiB) on all {} canisters, {concurrency} concurrent",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let qs = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            Arc::new(|| wasm().get_global_data().reply().build()),
+            true,
+        )
+        .await;
+        qs.report("HEAP-READ-QUERY (96 MiB/read)", t.elapsed());
+        println!("\n== done ==");
+        return;
+    }
+
+    if heap_mode {
+        // ---- Heap per-message write probe ----
+        // Stable memory traps a single message that dirties/accesses > 32 MiB;
+        // heap (Wasm) memory has no such per-message cap. push_equal_bytes(b, n)
+        // pushes n bytes onto the data stack, dirtying n bytes of heap.
+        println!("\n[heap] per-message heap-write probe (stable's per-msg limit is 32 MiB; heap has none)");
+        for sz in [24u32, 48, 96] {
+            let r = update(&agent, &canisters[0], wasm().push_equal_bytes(0x61, sz * MIB).reply().build()).await;
+            println!(
+                "  push {sz} MiB onto heap in ONE message: {}",
+                if r.is_ok() { "OK".to_string() } else { format!("TRAP {}", r.as_ref().err().unwrap().chars().take(200).collect::<String>()) }
+            );
+        }
+
+        // ---- Heap-write storm (analogue of the COMPUTE storm) ----
+        let upd_cans = Arc::new(canisters[..canisters.len() - 1].to_vec());
+        println!("\n[heap] heap-write storm ({secs}s): 8 MiB heap write/call on {} canisters", upd_cans.len());
+        let t = Instant::now();
+        let ws = storm(
+            agent.clone(),
+            upd_cans.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            Arc::new(|| wasm().push_equal_bytes(0x61, 8 * MIB).reply().build()),
+            false,
+        )
+        .await;
+        ws.report("HEAP-WRITE (8 MiB/call)", t.elapsed());
+
+        // ---- Populate a persistent heap global, then read it ----
+        // 96 MiB so each get_global_data read pulls ~96 MiB of distinct state
+        // into memory (no per-execution accessed cap on heap, unlike stable's
+        // 32 MiB). 3 canisters x 96 MiB = ~288 MiB distinct read working set.
+        const BIG_MIB: u32 = 96;
+        println!("\n[heap] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let r = update(
+                &agent,
+                c,
+                wasm().push_equal_bytes(0x41 + i as u32, BIG_MIB * MIB).set_global_data_from_stack().reply().build(),
+            )
+            .await;
+            println!("  canister {i} = {c}: {}", if r.is_ok() { "populated".to_string() } else { format!("ERR {}", r.as_ref().err().unwrap().chars().take(160).collect::<String>()) });
+        }
+        println!("[heap] waiting ~25s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // ---- Heap-read storm (analogue of the stable READ test) ----
+        // get_global_data reads the whole 96 MiB global in one execution — more
+        // than the 32 MiB stable per-message accessed limit would ever allow.
+        let qry_cans = Arc::new(vec![canisters[canisters.len() - 1]]);
+        println!(
+            "\n[heap] heap-read storm ({secs}s): read {BIG_MIB} MiB heap global/call — UPDATES on {} canisters, QUERIES on 1",
+            upd_cans.len()
+        );
+        let t = Instant::now();
+        let (us, qs) = tokio::join!(
+            storm(agent.clone(), upd_cans.clone(), concurrency, Duration::from_secs(secs), Arc::new(|| wasm().get_global_data().reply().build()), false),
+            storm(agent.clone(), qry_cans.clone(), concurrency, Duration::from_secs(secs), Arc::new(|| wasm().get_global_data().reply().build()), true),
+        );
+        us.report(&format!("HEAP-READ-UPDATE ({BIG_MIB} MiB heap read)"), t.elapsed());
+        qs.report(&format!("HEAP-READ-QUERY ({BIG_MIB} MiB heap read)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
+    if !probe_only {
+    // ---- Phase A: ingress/throughput storm (near-empty updates) ----
+    println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");
+    let t = Instant::now();
+    let stats = storm(
+        agent.clone(),
+        canisters.clone(),
+        concurrency,
+        Duration::from_secs(secs),
+        Arc::new(|| wasm().reply().build()),
+        false,
+    )
+    .await;
+    stats.report("THROUGHPUT (empty updates)", t.elapsed());
+
+    // ---- Phase B: compute storm (8 MiB stable fill per call, within dirty limit) ----
+    println!("\n[3/5] COMPUTE storm: 8 MiB stable_fill per call, {concurrency} concurrent, {secs}s");
+    let t = Instant::now();
+    let stats = storm(
+        agent.clone(),
+        canisters.clone(),
+        concurrency,
+        Duration::from_secs(secs),
+        Arc::new(|| wasm().stable_fill(0, 0x61, 8 * MIB).reply().build()),
+        false,
+    )
+    .await;
+    stats.report("COMPUTE (8 MiB fill)", t.elapsed());
+    }
+
+    // ---- Phase C: per-message dirty-page limit (32 MiB) ----
+    // Grow in a separate (committed) message first, then fill in-bounds amounts
+    // so we isolate the *dirty-page* limit from any grow/bounds effects.
+    println!("\n[4/5] DIRTY-LIMIT probe (per-message stable dirty limit = 32 MiB)");
+    let c = canisters[0];
+    let g = update(&agent, &c, wasm().stable_grow(1024).reply().build()).await; // +64 MiB, commit
+    println!(
+        "  grow +64 MiB (own message): {}",
+        if g.is_ok() { "OK".to_string() } else { format!("ERR {}", g.as_ref().err().unwrap().chars().take(200).collect::<String>()) }
+    );
+    let small = update(&agent, &c, wasm().stable_fill(0, 0x62, 24 * MIB).reply().build()).await;
+    println!(
+        "  fill 24 MiB (UNDER 32 MiB limit): {}",
+        if small.is_ok() { "OK".to_string() } else { format!("ERR {}", small.as_ref().err().unwrap().chars().take(260).collect::<String>()) }
+    );
+    let big = update(&agent, &c, wasm().stable_fill(0, 0x62, 48 * MIB).reply().build()).await;
+    println!(
+        "  fill 48 MiB (OVER 32 MiB limit):  {}",
+        if big.is_ok() { "OK — NO LIMIT ENFORCED".to_string() } else { format!("TRAP {}", big.as_ref().err().unwrap().chars().take(320).collect::<String>()) }
+    );
+
+    if !probe_only {
+    // ---- Phase D: grow stable memory toward the 512 MiB subnet cap ----
+    println!("\n[5/5] MEMORY-GROWTH storm: grow 16 MiB + fill per call across all canisters until rejected");
+    let (grow, total_mib) = (Arc::new(Stats::default()), Arc::new(AtomicU64::new(0)));
+    let t = Instant::now(); // wall clock for the whole growth phase, used by the report
+    let mut handles = Vec::new();
+    for &c in canisters.iter() {
+        let agent = agent.clone();
+        let grow = grow.clone();
+        let total_mib = total_mib.clone();
+        handles.push(tokio::spawn(async move {
+            // Hard cap iterations so a misbehaving run can't loop forever.
+            for _ in 0..64 {
+                let p = wasm().stable_grow(256).stable_fill(0, 0x63, 16 * MIB).reply().build();
+                let started = Instant::now();
+                let res = update(&agent, &c, p).await;
+                let ok = res.is_ok();
+                grow.record(started, &res);
+                if ok {
+                    total_mib.fetch_add(16, Ordering::Relaxed);
+                } else {
+                    break; // first rejection for this canister: stop growing it
+                }
+            }
+        }));
+    }
+    for h in handles {
+        let _ = h.await;
+    }
+    grow.report("MEMORY-GROWTH", t.elapsed());
+    println!(
+        "  approx stable memory successfully grown across subnet: ~{} MiB",
+        total_mib.load(Ordering::Relaxed)
+    );
+    }
+
+    println!("\n== done ==");
+}
diff --git a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
index 420a629a14ac..0f88fc359c52 100644
--- a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
+++ b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
@@ -72,17 +72,27 @@ const SANDBOX_PROCESS_UPDATE_INTERVAL: Duration = Duration::from_secs(10);
 /// distributed across 4 execution cores.
 const SANDBOX_PROCESSES_TO_EVICT: usize = 200;
 
-/// The RSS to evict in one go in order to amortize for the eviction cost (1 GiB).
-const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(1024 * 1024 * 1024);
+/// The RSS to evict in one go in order to amortize for the eviction cost.
+/// Nano-replica profile: small batches (64 MiB) since the whole sandbox RSS
+/// budget is only ~128 MiB.
+const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(64 * 1024 * 1024);
 
 /// By default, assume each sandbox process consumes 5 MiB of RSS.
 /// The actual memory usage is updated asynchronously.
 /// See `monitor_and_evict_sandbox_processes`
 const DEFAULT_SANDBOX_PROCESS_RSS: NumBytes = NumBytes::new(5 * 1024 * 1024);
 
-/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`.
+/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`,
+/// but never below `MIN_SANDBOXES_RSS`.
 const MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO: u64 = 3;
 
+/// Floor on the maximum total sandbox RSS, independent of the heap delta
+/// capacity. On the nano-replica profile the heap delta capacity is tiny
+/// (tens of MiB), and `heap_delta_capacity / 3` would otherwise starve the
+/// sandboxes and cause constant respawning. Keeping a small set of canisters
+/// warm matters more than a large heap delta buffer when checkpoints are cheap.
+const MIN_SANDBOXES_RSS: NumBytes = NumBytes::new(128 * 1024 * 1024);
+
 /// To speedup synchronous operations, the sandbox RSS-based eviction
 /// is triggered only when the system's available memory falls below
 /// the specified byte threshold.
@@ -1509,7 +1519,7 @@ impl SandboxedExecutionController {
             .maximum_state_delta
             .and_then(|d| if d.get() != 0 { Some(d) } else { None })
             .unwrap_or(self.default_subnet_heap_delta_capacity);
-        heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO
+        (heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO).max(MIN_SANDBOXES_RSS)
     }
 
     fn trigger_sandbox_eviction<F>(
diff --git a/rs/config/src/embedders.rs b/rs/config/src/embedders.rs
index 167c0bfabbdb..347b18780584 100644
--- a/rs/config/src/embedders.rs
+++ b/rs/config/src/embedders.rs
@@ -49,7 +49,7 @@ pub(crate) const MAX_NUMBER_EXPORTED_FUNCTIONS: usize = 1000;
 pub(crate) const MAX_SUM_EXPORTED_FUNCTION_NAME_LENGTHS: usize = 20000;
 /// The number of threads to use for query execution per canister.
 /// See also `QUERY_EXECUTION_THREADS_TOTAL`.
-pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 2;
+pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 1;
 
 /// In terms of execution time, compiling 1 WASM instructions takes as much time
 /// as actually executing 6_000 instructions. Only public for use in tests.
@@ -63,25 +63,30 @@ pub const DEFAULT_CREATE_EXECUTION_STATE_BASE_COST: NumInstructions =
     NumInstructions::new(20_000_000);
 
 /// The number of rayon threads used by wasmtime to compile wasm binaries
-const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 10;
+/// Nano-replica profile: minimal parallelism.
+const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 2;
 
 /// The number of rayon threads use for the parallel page copying optimization.
-const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 8;
+/// Nano-replica profile: minimal parallelism.
+const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 2;
 
 /// Sandbox process eviction ensures that the number of sandbox processes is
-/// always below this threshold. Idle sandboxes should be using at most ~5MiB
-/// resident memory with the on-disk compilation cache, so 10,000 sandboxes
-/// shouldn't be more than 50 GiB.
-pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 10_000;
+/// always below this threshold. Nano-replica profile: at ~5MiB idle RSS each,
+/// 32 sandboxes total ~160 MiB, which can exceed the ~128 MiB sandbox RSS budget
+/// (RSS-based eviction may act first). The cap keeps a small working set warm:
+/// sandbox respawn is expensive and serialized behind the single update thread.
+pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 32;
 
 /// A sandbox process may be evicted after it has been idle for this
 /// duration and sandbox process eviction is activated.
-pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(30 * 60);
+/// Nano-replica profile: evict idle sandboxes quickly.
+pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(2 * 60);
 
 /// The maximum number of pages that a message dirties without optimizing dirty
 /// page copying by triggering a new execution slice for copying pages.
-/// This default is 1 GiB.
-pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize = (GIB as usize) / PAGE_SIZE;
+/// Nano-replica profile: 32 MiB.
+pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize =
+    (32 * 1024 * 1024) / PAGE_SIZE;
 
 /// Scheduling overhead for copying dirty pages, in instructions.
 pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::new(3_000);
@@ -90,31 +95,37 @@ pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::ne
 pub const WASM64_DIRTY_PAGE_OVERHEAD_MULTIPLIER: u64 = 4;
 
 const KIB: u64 = 1024;
-const GIB: u64 = KIB * KIB * KIB;
+const MIB: u64 = KIB * KIB;
+
+// Nano-replica profile: these limits bound the *resident* working set of a
+// single execution. On a 512 MiB - 1 GiB VM they MUST stay well below the
+// available RAM, otherwise a single message can OOM-kill the replica (which on
+// a replicated subnet means state divergence). The mainnet values were 1-8 GiB.
 
 // Maximum number of stable memory dirty OS pages (4KiB) that an upgrade/install message execution
 // is allowed to produce.
 const STABLE_MEMORY_DIRTY_PAGE_LIMIT_UPGRADE: NumOsPages =
-    NumOsPages::new(8 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(128 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory dirty OS pages (4KiB) that a regular message (update) execution
 // is allowed to produce.
 const STABLE_MEMORY_DIRTY_PAGE_LIMIT_MESSAGE: NumOsPages =
-    NumOsPages::new(2 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory dirty OS pages (4KiB) that a non-replicated query is allowed to produce.
-const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages = NumOsPages::new(GIB / (PAGE_SIZE as u64));
+const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages =
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 
 // Maximum number of stable memory OS pages (4KiB) that that an upgrade/install message execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_UPGRADE: NumOsPages =
-    NumOsPages::new(8 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(128 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory OS pages (4KiB) that a that a regular message (update) execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_MESSAGE: NumOsPages =
-    NumOsPages::new(2 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory OS pages (4KiB) that a single non-replicated query execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_QUERY: NumOsPages =
-    NumOsPages::new(GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 
 /// The maximum size in bytes for an uncompressed Wasm module. This value is
 /// also used as the maximum size for the Wasm chunk store of each canister.
diff --git a/rs/config/src/execution_environment.rs b/rs/config/src/execution_environment.rs
index 75a72fc8aacc..4a7ce32c2f6a 100644
--- a/rs/config/src/execution_environment.rs
+++ b/rs/config/src/execution_environment.rs
@@ -9,7 +9,6 @@ use std::{str::FromStr, time::Duration};
 const KIB: u64 = 1024;
 const MIB: u64 = 1024 * KIB;
 const GIB: u64 = 1024 * MIB;
-const TIB: u64 = 1024 * GIB;
 
 const REPLICATED_INTER_CANISTER_LOG_FETCH_FEATURE: FlagStatus = FlagStatus::Disabled;
 
@@ -34,7 +33,12 @@ pub const TEST_DEFAULT_LOG_MEMORY_USAGE: u64 = if LOG_MEMORY_STORE_FEATURE_ENABL
 /// This specifies the threshold in bytes at which the subnet memory usage is
 /// considered to be high. If this value is greater or equal to the subnet
 /// capacity, then the subnet is never considered to have high usage.
-const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB);
+// Nano-replica profile: set equal to the subnet memory capacity so the subnet
+// is never considered "high usage" and the storage cycle-reservation mechanism
+// stays disabled — canisters can allocate freely up to the subnet capacity
+// without reserving cycles (reservation pricing is calibrated for mainnet and
+// would otherwise reject growth on a tiny subnet).
+const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(512 * MIB);
 
 /// This is the upper limit on how much logical storage canisters can request to
 /// be store on a given subnet.
@@ -42,7 +46,8 @@ const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB);
 /// Logical storage is the amount of storage being used from the point of view
 /// of the canister. The actual storage used by the nodes can be higher as the
 /// IC protocol requires storing copies of the canister state.
-const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB);
+// Nano-replica profile: a few hundred MB of logical storage for the whole subnet.
+const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(512 * MIB);
 
 /// This is the upper limit on how much memory can be used by all guaranteed
 /// response canister messages on a given subnet.
@@ -50,24 +55,27 @@ const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB);
 /// Guaranteed response message memory usage is calculated as the total size of
 /// enqueued guaranteed responses; plus the maximum allowed response size per
 /// reserved guaranteed response slot.
-const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(15 * GIB);
+// Nano-replica profile: guaranteed-response messages are heavily reserved
+// (~2 MiB per outstanding call), so keep this small. Consider rejecting
+// guaranteed-response calls entirely (best-effort-only subnet).
+const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(64 * MIB);
 
 /// The limit on how much memory may be used by all guaranteed response messages
 /// on a given subnet at the end of a round.
 ///
 /// During the round, the best-effort message memory usage may exceed the limit,
 /// but the constraint is restored at the end of the round by shedding messages.
-const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(5 * GIB);
+const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB);
 
 /// This is the upper limit on how much memory can be used by the ingress
 /// history on a given subnet. It is lower than the subnet message memory
 /// capacity because here we count actual memory consumption as opposed to
 /// memory plus reservations.
-const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(4 * GIB);
+const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB);
 
 /// This is the upper limit on how much memory can be used by wasm custom
 /// sections on a given subnet.
-const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * GIB);
+const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(16 * MIB);
 
 // The gen 1 production machines should have 64 cores.
 // We could in theory use 32 threads, leaving other threads for query handling,
@@ -79,15 +87,26 @@ const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 *
 //    We needs to ensure:
 //    `SUBNET_MEMORY_CAPACITY / number_of_threads >= max_canister_memory`
 //    If you change this number please adjust other constants as well.
-pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 4;
+// Nano-replica profile: minimum viable update-execution parallelism. This also
+// sets `SchedulerConfig::scheduler_cores` and divides the (small) subnet memory
+// capacity across threads.
+//
+// NOTE: the DTS scheduler requires at least 2 cores — compute capacity is
+// `(scheduler_cores - 1) * 100%` (see `round_schedule::compute_capacity_percent`),
+// so a single core yields 0% allocatable capacity and trips a scheduler
+// invariant on every round. 2 is the floor.
+pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 2;
 
 /// The number of bytes reserved for response callback executions.
-/// For each thread, we reserve 2.5GiB of memory or, equivalently, 2560MiB.
+/// Nano-replica profile: keep this small (8 MiB per thread) so canisters can
+/// allocate almost the entire subnet memory capacity. The reservation only
+/// guards response-callback execution headroom; on a best-effort-leaning nano
+/// subnet a small reservation is sufficient.
 pub const SUBNET_MEMORY_RESERVATION: NumBytes =
-    NumBytes::new(2560 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
+    NumBytes::new(8 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
 
 /// The soft limit on the subnet-wide number of callbacks.
-pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 1_000_000;
+pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 4_096;
 
 /// The number of callbacks that are guaranteed to each canister.
 pub const CANISTER_GUARANTEED_CALLBACK_QUOTA: usize = 50;
@@ -108,7 +127,11 @@ pub const STOP_CANISTER_TIMEOUT_DURATION: Duration = Duration::from_secs(5 * 60)
 /// potential fragmentation. This limit should be larger than the maximum
 /// canister memory size to guarantee that a message that overwrites the whole
 /// memory can succeed.
-pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(140 * GIB);
+// Nano-replica profile: heap deltas are the dominant *resident* cost between checkpoints.
+// Keep this small and checkpoint frequently (the subnet state is only a few hundred MB, so
+// checkpoints are cheap). Must be >= the per-message dirty page limit (32 MiB) so a single
+// message can still complete. NOTE(review): the upgrade/install limit is 128 MiB — confirm.
+pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(96 * MIB);
 
 /// The maximum number of instructions for inspect_message calls.
 const MAX_INSTRUCTIONS_FOR_MESSAGE_ACCEPTANCE_CALLS: NumInstructions =
@@ -126,7 +149,8 @@ pub const INSTRUCTION_OVERHEAD_PER_QUERY_CALL: u64 = 50_000_000;
 
 /// The number of query execution threads overall for all canisters.
 /// See also `QUERY_EXECUTION_THREADS_PER_CANISTER`.
-pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 4;
+// Nano-replica profile: a single query-execution thread.
+pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 1;
 
 /// When a canister is scheduled for query execution, it is allowed to run for
 /// this amount of time. This limit controls how many queries the canister
@@ -147,7 +171,7 @@ const QUERY_SCHEDULING_TIME_SLICE_PER_CANISTER: Duration = Duration::from_millis
 ///
 /// The limit includes both cache keys and values, for successful query
 /// executions and user errors.
-const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(200 * MIB);
+const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(16 * MIB);
 
 /// The upper limit on how long the cache entry stays valid in the query cache.
 const QUERY_CACHE_MAX_EXPIRY_TIME: Duration = Duration::from_secs(600);
@@ -187,7 +211,9 @@ pub const DOGECOIN_MAINNET_CANISTER_ID: &str = "gordg-fyaaa-aaaan-aaadq-cai";
 const DOGECOIN_MAINNET_STAGING_CANISTER_ID: &str = "bhuiy-ciaaa-aaaad-abwea-cai";
 
 /// The capacity of the Wasm compilation cache.
-pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(10 * GIB);
+// Nano-replica profile: the compilation cache is on-disk; keep the in-memory
+// bound small.
+pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(64 * MIB);
 
 /// Maximum number of controllers allowed in a request (specified in the interface spec).
 pub const MAX_ALLOWED_CONTROLLERS_COUNT: usize = 10;
diff --git a/rs/config/src/message_routing.rs b/rs/config/src/message_routing.rs
index b1f7c064c560..355912020beb 100644
--- a/rs/config/src/message_routing.rs
+++ b/rs/config/src/message_routing.rs
@@ -4,13 +4,14 @@ use serde::{Deserialize, Serialize};
 ///
 /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its
 /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`.
-pub const TARGET_STREAM_SIZE_BYTES: usize = 10 * 1024 * 1024;
+// Nano-replica profile: smaller XNet streams to bound per-stream memory.
+pub const TARGET_STREAM_SIZE_BYTES: usize = 2 * 1024 * 1024;
 
 /// Maximum number of messages in a stream.
 ///
 /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its
 /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`.
-pub const MAX_STREAM_MESSAGES: usize = 10_000;
+pub const MAX_STREAM_MESSAGES: usize = 1_000;
 
 #[derive(Clone, Eq, PartialEq, Debug, Deserialize, Serialize)]
 #[serde(default)]
diff --git a/rs/config/src/subnet_config.rs b/rs/config/src/subnet_config.rs
index 14d51e72f421..69189a446556 100644
--- a/rs/config/src/subnet_config.rs
+++ b/rs/config/src/subnet_config.rs
@@ -26,7 +26,7 @@ impl SubnetSecurity {
     }
 }
 
-const GIB: u64 = 1024 * 1024 * 1024;
+const MIB: u64 = 1024 * 1024;
 const M: u64 = 1_000_000;
 const B: u64 = 1_000_000_000;
 const T: u128 = 1_000_000_000_000;
@@ -103,12 +103,17 @@ const SYSTEM_SUBNET_FACTOR: u64 = 10;
 // so a round may take 1 to 4 seconds. To avoid regressing the throughput of
 // slow subnets while maintaining the speed of fast subnets, we use the middle
 // value of 200MB.
-const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(200 * M);
+// Nano-replica profile: keep a single iteration's heap-delta production (64 MB)
+// below SUBNET_HEAP_DELTA_CAPACITY (96 MiB) so one iteration cannot overshoot the
+// cap and spike unreclaimable (anonymous) resident memory. This bounds the
+// dirty working set per iteration so writes stay safe on a 512 MiB - 1 GiB VM.
+const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(64 * M);
 
 /// The reserve represents the freely available portion of the
 /// `subnet_heap_delta_capacity` that can be used as a heap delta burst
 /// during the initial rounds following a checkpoint.
-const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * GIB);
+/// Nano-replica profile: must not exceed `SUBNET_HEAP_DELTA_CAPACITY`.
+const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * MIB);
 
 // Log all messages that took more than this value to execute.
 pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0;
@@ -122,7 +127,9 @@ pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0;
 ///
 ///   long installs + long updates + query threads = 1 + 4 + 2 = 7
 ///
-const MAX_PAUSED_EXECUTIONS: usize = 4;
+// Nano-replica profile: limit concurrent paused (DTS) executions to keep the
+// number of simultaneously live Wasm instances small.
+const MAX_PAUSED_EXECUTIONS: usize = 1;
 
 /// Cost for creating a new canister.
 pub const CANISTER_CREATION_FEE: Cycles = Cycles::new(500_000_000_000);
@@ -318,7 +325,10 @@ impl SchedulerConfig {
             max_heap_delta_per_iteration: MAX_HEAP_DELTA_PER_ITERATION,
             max_message_duration_before_warn_in_seconds:
                 MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS,
-            heap_delta_rate_limit: NumBytes::from(75 * 1024 * 1024),
+            // Nano-replica profile: cap per-canister heap delta per round so a
+            // single canister cannot fill the (small) subnet heap delta capacity
+            // in one round.
+            heap_delta_rate_limit: NumBytes::from(32 * MIB),
             install_code_rate_limit: MAX_INSTRUCTIONS_PER_SLICE,
             dirty_page_overhead: DEFAULT_DIRTY_PAGE_OVERHEAD,
             accumulated_priority_reset_interval: ACCUMULATED_PRIORITY_RESET_INTERVAL,