From 7c035b1a539847f652a32a2800b13b116aeb9e10 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Fri, 12 Jun 2026 16:19:12 +0200
Subject: [PATCH 01/14] =?UTF-8?q?feat(config):=20nano-replica=20memory=20p?=
 =?UTF-8?q?rofile=20for=20512=20MiB=E2=80=931=20GiB=20VMs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scale down the replica's memory capacities, reservations and limits so it
can run on a 512 MiB–1 GiB VM (down from the 512 GiB mainnet footprint),
accepting a substantially reduced subnet capacity.

execution_environment.rs:
- subnet memory capacity 2 TiB -> 512 MiB, threshold -> 384 MiB
- guaranteed-response msg mem 15 GiB -> 64 MiB, best-effort 5 GiB -> 32 MiB
- ingress history 4 GiB -> 32 MiB, wasm custom sections 2 GiB -> 16 MiB
- execution threads 4 -> 1, query threads 4 -> 1
- subnet memory reservation 2560 -> 64 MiB per thread
- callback soft limit 1,000,000 -> 4,096
- subnet heap delta capacity 140 GiB -> 96 MiB
- query cache 200 -> 16 MiB, compilation cache 10 GiB -> 64 MiB

embedders.rs (OOM-cliff fix — bound a single execution's resident set):
- stable dirty/accessed page limits 1-8 GiB -> 32/128 MiB
- max dirty pages without optimization 1 GiB -> 32 MiB
- sandbox count 10,000 -> 32, idle time 30m -> 2m
- rayon compilation/page-allocator threads 10/8 -> 2/2
- query threads per canister 2 -> 1

subnet_config.rs:
- heap delta initial reserve 32 GiB -> 32 MiB (must be <= capacity)
- max paused (DTS) executions 4 -> 1
- per-canister heap delta rate limit 75 -> 32 MiB

sandboxed_execution_controller.rs:
- decouple max sandbox RSS from heap delta via a 128 MiB floor
  (MIN_SANDBOXES_RSS), so a tiny heap delta no longer starves sandboxes
- eviction batch 1 GiB -> 64 MiB

message_routing.rs:
- XNet stream target size 10 -> 2 MiB, max stream messages 10,000 -> 1,000

Verified: rustfmt, clippy (clean), cargo test -p ic-config (19 passed),
bazel build //rs/config:config //rs/canister_sandbox:backend_lib.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../sandboxed_execution_controller.rs         | 18 ++++++--
 rs/config/src/embedders.rs                    | 45 ++++++++++++-------
 rs/config/src/execution_environment.rs        | 45 ++++++++++++-------
 rs/config/src/message_routing.rs              |  5 ++-
 rs/config/src/subnet_config.rs                | 14 ++++--
 5 files changed, 85 insertions(+), 42 deletions(-)

diff --git a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
index 420a629a14ac..0f88fc359c52 100644
--- a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
+++ b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs
@@ -72,17 +72,27 @@ const SANDBOX_PROCESS_UPDATE_INTERVAL: Duration = Duration::from_secs(10);
 /// distributed across 4 execution cores.
 const SANDBOX_PROCESSES_TO_EVICT: usize = 200;
 
-/// The RSS to evict in one go in order to amortize for the eviction cost (1 GiB).
-const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(1024 * 1024 * 1024);
+/// The RSS to evict in one go in order to amortize for the eviction cost.
+/// Nano-replica profile: small batches (64 MiB) since the whole sandbox RSS
+/// budget is only ~128 MiB.
+const SANDBOX_PROCESSES_RSS_TO_EVICT: NumBytes = NumBytes::new(64 * 1024 * 1024);
 
 /// By default, assume each sandbox process consumes 5 MiB of RSS.
 /// The actual memory usage is updated asynchronously.
 /// See `monitor_and_evict_sandbox_processes`
 const DEFAULT_SANDBOX_PROCESS_RSS: NumBytes = NumBytes::new(5 * 1024 * 1024);
 
-/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`.
+/// The maximum sandbox RSS is computed as `subnet_heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO`,
+/// but never below `MIN_SANDBOXES_RSS`.
 const MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO: u64 = 3;
 
+/// Floor on the maximum total sandbox RSS, independent of the heap delta
+/// capacity. On the nano-replica profile the heap delta capacity is tiny
+/// (tens of MiB), and `heap_delta_capacity / 3` would otherwise starve the
+/// sandboxes and cause constant respawning. Keeping a small set of canisters
+/// warm matters more than a large heap delta buffer when checkpoints are cheap.
+const MIN_SANDBOXES_RSS: NumBytes = NumBytes::new(128 * 1024 * 1024);
+
 /// To speedup synchronous operations, the sandbox RSS-based eviction
 /// is triggered only when the system's available memory falls below
 /// the specified byte threshold.
@@ -1509,7 +1519,7 @@ impl SandboxedExecutionController {
             .maximum_state_delta
             .and_then(|d| if d.get() != 0 { Some(d) } else { None })
             .unwrap_or(self.default_subnet_heap_delta_capacity);
-        heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO
+        (heap_delta_capacity / MAX_SANDBOXES_RSS_TO_HEAP_DELTA_RATIO).max(MIN_SANDBOXES_RSS)
     }
 
     fn trigger_sandbox_eviction<F>(
diff --git a/rs/config/src/embedders.rs b/rs/config/src/embedders.rs
index 167c0bfabbdb..347b18780584 100644
--- a/rs/config/src/embedders.rs
+++ b/rs/config/src/embedders.rs
@@ -49,7 +49,7 @@ pub(crate) const MAX_NUMBER_EXPORTED_FUNCTIONS: usize = 1000;
 pub(crate) const MAX_SUM_EXPORTED_FUNCTION_NAME_LENGTHS: usize = 20000;
 /// The number of threads to use for query execution per canister.
 /// See also `QUERY_EXECUTION_THREADS_TOTAL`.
-pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 2;
+pub(crate) const QUERY_EXECUTION_THREADS_PER_CANISTER: usize = 1;
 
 /// In terms of execution time, compiling 1 WASM instructions takes as much time
 /// as actually executing 6_000 instructions. Only public for use in tests.
@@ -63,25 +63,30 @@ pub const DEFAULT_CREATE_EXECUTION_STATE_BASE_COST: NumInstructions =
     NumInstructions::new(20_000_000);
 
 /// The number of rayon threads used by wasmtime to compile wasm binaries
-const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 10;
+/// Nano-replica profile: minimal parallelism.
+const DEFAULT_WASMTIME_RAYON_COMPILATION_THREADS: usize = 2;
 
 /// The number of rayon threads use for the parallel page copying optimization.
-const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 8;
+/// Nano-replica profile: minimal parallelism.
+const DEFAULT_PAGE_ALLOCATOR_THREADS: usize = 2;
 
 /// Sandbox process eviction ensures that the number of sandbox processes is
-/// always below this threshold. Idle sandboxes should be using at most ~5MiB
-/// resident memory with the on-disk compilation cache, so 10,000 sandboxes
-/// shouldn't be more than 50 GiB.
-pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 10_000;
+/// always below this threshold. Nano-replica profile: at ~5MiB idle RSS each,
+/// 32 sandboxes stay well under the budget while keeping a small working set
+/// of canisters warm (sandbox respawn is expensive and serialized behind the
+/// single update thread).
+pub(crate) const DEFAULT_MAX_SANDBOX_COUNT: usize = 32;
 
 /// A sandbox process may be evicted after it has been idle for this
 /// duration and sandbox process eviction is activated.
-pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(30 * 60);
+/// Nano-replica profile: evict idle sandboxes quickly.
+pub(crate) const DEFAULT_MAX_SANDBOX_IDLE_TIME: Duration = Duration::from_secs(2 * 60);
 
 /// The maximum number of pages that a message dirties without optimizing dirty
 /// page copying by triggering a new execution slice for copying pages.
-/// This default is 1 GiB.
-pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize = (GIB as usize) / PAGE_SIZE;
+/// Nano-replica profile: 32 MiB.
+pub(crate) const DEFAULT_MAX_DIRTY_PAGES_WITHOUT_OPTIMIZATION: usize =
+    (32 * 1024 * 1024) / PAGE_SIZE;
 
 /// Scheduling overhead for copying dirty pages, in instructions.
 pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::new(3_000);
@@ -90,31 +95,37 @@ pub(crate) const DIRTY_PAGE_COPY_OVERHEAD: NumInstructions = NumInstructions::ne
 pub const WASM64_DIRTY_PAGE_OVERHEAD_MULTIPLIER: u64 = 4;
 
 const KIB: u64 = 1024;
-const GIB: u64 = KIB * KIB * KIB;
+const MIB: u64 = KIB * KIB;
+
+// Nano-replica profile: these limits bound the *resident* working set of a
+// single execution. On a 512 MiB - 1 GiB VM they MUST stay well below the
+// available RAM, otherwise a single message can OOM-kill the replica (which on
+// a replicated subnet means state divergence). The mainnet values were 1-8 GiB.
 
 // Maximum number of stable memory dirty OS pages (4KiB) that an upgrade/install message execution
 // is allowed to produce.
 const STABLE_MEMORY_DIRTY_PAGE_LIMIT_UPGRADE: NumOsPages =
-    NumOsPages::new(8 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(128 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory dirty OS pages (4KiB) that a regular message (update) execution
 // is allowed to produce.
 const STABLE_MEMORY_DIRTY_PAGE_LIMIT_MESSAGE: NumOsPages =
-    NumOsPages::new(2 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory dirty OS pages (4KiB) that a non-replicated query is allowed to produce.
-const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages = NumOsPages::new(GIB / (PAGE_SIZE as u64));
+const STABLE_MEMORY_DIRTY_PAGE_LIMIT_QUERY: NumOsPages =
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 
 // Maximum number of stable memory OS pages (4KiB) that that an upgrade/install message execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_UPGRADE: NumOsPages =
-    NumOsPages::new(8 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(128 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory OS pages (4KiB) that a that a regular message (update) execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_MESSAGE: NumOsPages =
-    NumOsPages::new(2 * GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 // Maximum number of stable memory OS pages (4KiB) that a single non-replicated query execution
 // is allowed to access.
 const STABLE_MEMORY_ACCESSED_PAGE_LIMIT_QUERY: NumOsPages =
-    NumOsPages::new(GIB / (PAGE_SIZE as u64));
+    NumOsPages::new(32 * MIB / (PAGE_SIZE as u64));
 
 /// The maximum size in bytes for an uncompressed Wasm module. This value is
 /// also used as the maximum size for the Wasm chunk store of each canister.
diff --git a/rs/config/src/execution_environment.rs b/rs/config/src/execution_environment.rs
index 75a72fc8aacc..f07f56b75d22 100644
--- a/rs/config/src/execution_environment.rs
+++ b/rs/config/src/execution_environment.rs
@@ -9,7 +9,6 @@ use std::{str::FromStr, time::Duration};
 const KIB: u64 = 1024;
 const MIB: u64 = 1024 * KIB;
 const GIB: u64 = 1024 * MIB;
-const TIB: u64 = 1024 * GIB;
 
 const REPLICATED_INTER_CANISTER_LOG_FETCH_FEATURE: FlagStatus = FlagStatus::Disabled;
 
@@ -34,7 +33,8 @@ pub const TEST_DEFAULT_LOG_MEMORY_USAGE: u64 = if LOG_MEMORY_STORE_FEATURE_ENABL
 /// This specifies the threshold in bytes at which the subnet memory usage is
 /// considered to be high. If this value is greater or equal to the subnet
 /// capacity, then the subnet is never considered to have high usage.
-const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB);
+// Nano-replica profile: scaled down to run on a 512 MiB - 1 GiB VM.
+const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(384 * MIB);
 
 /// This is the upper limit on how much logical storage canisters can request to
 /// be store on a given subnet.
@@ -42,7 +42,8 @@ const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(750 * GIB);
 /// Logical storage is the amount of storage being used from the point of view
 /// of the canister. The actual storage used by the nodes can be higher as the
 /// IC protocol requires storing copies of the canister state.
-const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB);
+// Nano-replica profile: a few hundred MB of logical storage for the whole subnet.
+const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(512 * MIB);
 
 /// This is the upper limit on how much memory can be used by all guaranteed
 /// response canister messages on a given subnet.
@@ -50,24 +51,27 @@ const SUBNET_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * TIB);
 /// Guaranteed response message memory usage is calculated as the total size of
 /// enqueued guaranteed responses; plus the maximum allowed response size per
 /// reserved guaranteed response slot.
-const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(15 * GIB);
+// Nano-replica profile: guaranteed-response messages are heavily reserved
+// (~2 MiB per outstanding call), so keep this small. Consider rejecting
+// guaranteed-response calls entirely (best-effort-only subnet).
+const SUBNET_GUARANTEED_RESPONSE_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(64 * MIB);
 
 /// The limit on how much memory may be used by all guaranteed response messages
 /// on a given subnet at the end of a round.
 ///
 /// During the round, the best-effort message memory usage may exceed the limit,
 /// but the constraint is restored at the end of the round by shedding messages.
-const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(5 * GIB);
+const SUBNET_BEST_EFFORT_MESSAGE_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB);
 
 /// This is the upper limit on how much memory can be used by the ingress
 /// history on a given subnet. It is lower than the subnet message memory
 /// capacity because here we count actual memory consumption as opposed to
 /// memory plus reservations.
-const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(4 * GIB);
+const INGRESS_HISTORY_MEMORY_CAPACITY: NumBytes = NumBytes::new(32 * MIB);
 
 /// This is the upper limit on how much memory can be used by wasm custom
 /// sections on a given subnet.
-const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 * GIB);
+const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(16 * MIB);
 
 // The gen 1 production machines should have 64 cores.
 // We could in theory use 32 threads, leaving other threads for query handling,
@@ -79,15 +83,19 @@ const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(2 *
 //    We needs to ensure:
 //    `SUBNET_MEMORY_CAPACITY / number_of_threads >= max_canister_memory`
 //    If you change this number please adjust other constants as well.
-pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 4;
+// Nano-replica profile: a single update-execution thread. This also sets
+// `SchedulerConfig::scheduler_cores` and divides the (small) subnet memory
+// capacity by a single thread.
+pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 1;
 
 /// The number of bytes reserved for response callback executions.
-/// For each thread, we reserve 2.5GiB of memory or, equivalently, 2560MiB.
+/// Nano-replica profile: 64MiB per thread (must stay well below the subnet
+/// memory capacity, otherwise no memory is left for canister state).
 pub const SUBNET_MEMORY_RESERVATION: NumBytes =
-    NumBytes::new(2560 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
+    NumBytes::new(64 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
 
 /// The soft limit on the subnet-wide number of callbacks.
-pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 1_000_000;
+pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 4_096;
 
 /// The number of callbacks that are guaranteed to each canister.
 pub const CANISTER_GUARANTEED_CALLBACK_QUOTA: usize = 50;
@@ -108,7 +116,11 @@ pub const STOP_CANISTER_TIMEOUT_DURATION: Duration = Duration::from_secs(5 * 60)
 /// potential fragmentation. This limit should be larger than the maximum
 /// canister memory size to guarantee that a message that overwrites the whole
 /// memory can succeed.
-pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(140 * GIB);
+// Nano-replica profile: heap deltas are the dominant *resident* cost between
+// checkpoints. Keep this small and checkpoint frequently (the subnet state is
+// only a few hundred MB, so checkpoints are cheap). Must be >= the per-message dirty page
+// limit so a single message can complete. NOTE(review): upgrades may dirty 128 MiB — confirm.
+pub(crate) const SUBNET_HEAP_DELTA_CAPACITY: NumBytes = NumBytes::new(96 * MIB);
 
 /// The maximum number of instructions for inspect_message calls.
 const MAX_INSTRUCTIONS_FOR_MESSAGE_ACCEPTANCE_CALLS: NumInstructions =
@@ -126,7 +138,8 @@ pub const INSTRUCTION_OVERHEAD_PER_QUERY_CALL: u64 = 50_000_000;
 
 /// The number of query execution threads overall for all canisters.
 /// See also `QUERY_EXECUTION_THREADS_PER_CANISTER`.
-pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 4;
+// Nano-replica profile: a single query-execution thread.
+pub(crate) const QUERY_EXECUTION_THREADS_TOTAL: usize = 1;
 
 /// When a canister is scheduled for query execution, it is allowed to run for
 /// this amount of time. This limit controls how many queries the canister
@@ -147,7 +160,7 @@ const QUERY_SCHEDULING_TIME_SLICE_PER_CANISTER: Duration = Duration::from_millis
 ///
 /// The limit includes both cache keys and values, for successful query
 /// executions and user errors.
-const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(200 * MIB);
+const QUERY_CACHE_CAPACITY: NumBytes = NumBytes::new(16 * MIB);
 
 /// The upper limit on how long the cache entry stays valid in the query cache.
 const QUERY_CACHE_MAX_EXPIRY_TIME: Duration = Duration::from_secs(600);
@@ -187,7 +200,9 @@ pub const DOGECOIN_MAINNET_CANISTER_ID: &str = "gordg-fyaaa-aaaan-aaadq-cai";
 const DOGECOIN_MAINNET_STAGING_CANISTER_ID: &str = "bhuiy-ciaaa-aaaad-abwea-cai";
 
 /// The capacity of the Wasm compilation cache.
-pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(10 * GIB);
+// Nano-replica profile: the compilation cache is on-disk; keep the in-memory
+// bound small.
+pub const MAX_COMPILATION_CACHE_SIZE: NumBytes = NumBytes::new(64 * MIB);
 
 /// Maximum number of controllers allowed in a request (specified in the interface spec).
 pub const MAX_ALLOWED_CONTROLLERS_COUNT: usize = 10;
diff --git a/rs/config/src/message_routing.rs b/rs/config/src/message_routing.rs
index b1f7c064c560..355912020beb 100644
--- a/rs/config/src/message_routing.rs
+++ b/rs/config/src/message_routing.rs
@@ -4,13 +4,14 @@ use serde::{Deserialize, Serialize};
 ///
 /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its
 /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`.
-pub const TARGET_STREAM_SIZE_BYTES: usize = 10 * 1024 * 1024;
+// Nano-replica profile: smaller XNet streams to bound per-stream memory.
+pub const TARGET_STREAM_SIZE_BYTES: usize = 2 * 1024 * 1024;
 
 /// Maximum number of messages in a stream.
 ///
 /// At most `MAX_STREAM_MESSAGES` are enqueued into a stream; but only until its
 /// `count_bytes()` is greater than or equal to `TARGET_STREAM_SIZE_BYTES`.
-pub const MAX_STREAM_MESSAGES: usize = 10_000;
+pub const MAX_STREAM_MESSAGES: usize = 1_000;
 
 #[derive(Clone, Eq, PartialEq, Debug, Deserialize, Serialize)]
 #[serde(default)]
diff --git a/rs/config/src/subnet_config.rs b/rs/config/src/subnet_config.rs
index 14d51e72f421..48bd06192c44 100644
--- a/rs/config/src/subnet_config.rs
+++ b/rs/config/src/subnet_config.rs
@@ -26,7 +26,7 @@ impl SubnetSecurity {
     }
 }
 
-const GIB: u64 = 1024 * 1024 * 1024;
+const MIB: u64 = 1024 * 1024;
 const M: u64 = 1_000_000;
 const B: u64 = 1_000_000_000;
 const T: u128 = 1_000_000_000_000;
@@ -108,7 +108,8 @@ const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(200 * M);
 /// The reserve represents the freely available portion of the
 /// `subnet_heap_delta_capacity` that can be used as a heap delta burst
 /// during the initial rounds following a checkpoint.
-const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * GIB);
+/// Nano-replica profile: must not exceed `SUBNET_HEAP_DELTA_CAPACITY`.
+const HEAP_DELTA_INITIAL_RESERVE: NumBytes = NumBytes::new(32 * MIB);
 
 // Log all messages that took more than this value to execute.
 pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0;
@@ -122,7 +123,9 @@ pub const MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS: f64 = 5.0;
 ///
 ///   long installs + long updates + query threads = 1 + 4 + 2 = 7
 ///
-const MAX_PAUSED_EXECUTIONS: usize = 4;
+// Nano-replica profile: limit concurrent paused (DTS) executions to keep the
+// number of simultaneously live Wasm instances small.
+const MAX_PAUSED_EXECUTIONS: usize = 1;
 
 /// Cost for creating a new canister.
 pub const CANISTER_CREATION_FEE: Cycles = Cycles::new(500_000_000_000);
@@ -318,7 +321,10 @@ impl SchedulerConfig {
             max_heap_delta_per_iteration: MAX_HEAP_DELTA_PER_ITERATION,
             max_message_duration_before_warn_in_seconds:
                 MAX_MESSAGE_DURATION_BEFORE_WARN_IN_SECONDS,
-            heap_delta_rate_limit: NumBytes::from(75 * 1024 * 1024),
+            // Nano-replica profile: cap per-canister heap delta per round so a
+            // single canister cannot fill the (small) subnet heap delta capacity
+            // in one round.
+            heap_delta_rate_limit: NumBytes::from(32 * MIB),
             install_code_rate_limit: MAX_INSTRUCTIONS_PER_SLICE,
             dirty_page_overhead: DEFAULT_DIRTY_PAGE_OVERHEAD,
             accumulated_priority_reset_interval: ACCUMULATED_PRIORITY_RESET_INTERVAL,

From 994a3e98382b98fc38197173930df2cb96ec6a5d Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 09:33:12 +0200
Subject: [PATCH 02/14] fix(config): nano profile needs >=2 scheduler cores
 (DTS invariant)

The DTS scheduler computes allocatable compute capacity as
`(scheduler_cores - 1) * 100%` (round_schedule::compute_capacity_percent).
With NUMBER_OF_EXECUTION_THREADS = 1 this is 0%, so the invariant
`total_compute_allocation + 1% <= compute_capacity` fails on every round
and the replica panics in the MR Batch Processor on restart.

Bump to 2 (the scheduler floor). Memory cost is negligible: the extra
execution thread's Wasm address space is virtual, resident usage stays
bounded by the per-message dirty-page limits and the shared sandbox-RSS
budget, and SUBNET_MEMORY_RESERVATION is 64 MiB x 2 = 128 MiB (< the
512 MiB subnet cap).

Found by running a 4-node local-net subnet with the nano profile.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/config/src/execution_environment.rs | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/rs/config/src/execution_environment.rs b/rs/config/src/execution_environment.rs
index f07f56b75d22..0daab53b627e 100644
--- a/rs/config/src/execution_environment.rs
+++ b/rs/config/src/execution_environment.rs
@@ -83,10 +83,15 @@ const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(16 *
 //    We needs to ensure:
 //    `SUBNET_MEMORY_CAPACITY / number_of_threads >= max_canister_memory`
 //    If you change this number please adjust other constants as well.
-// Nano-replica profile: a single update-execution thread. This also sets
-// `SchedulerConfig::scheduler_cores` and divides the (small) subnet memory
-// capacity by a single thread.
-pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 1;
+// Nano-replica profile: minimum viable update-execution parallelism. This also
+// sets `SchedulerConfig::scheduler_cores` and divides the (small) subnet memory
+// capacity across threads.
+//
+// NOTE: the DTS scheduler requires at least 2 cores — compute capacity is
+// `(scheduler_cores - 1) * 100%` (see `round_schedule::compute_capacity_percent`),
+// so a single core yields 0% allocatable capacity and trips a scheduler
+// invariant on every round. 2 is the floor.
+pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 2;
 
 /// The number of bytes reserved for response callback executions.
 /// Nano-replica profile: 64MiB per thread (must stay well below the subnet

From 6dd40834fb669f28ef5223d90c3b12fd631d4184 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 09:33:12 +0200
Subject: [PATCH 03/14] test(canister_client): add subnet load-driver example
 (hammer)

Standalone stress driver for a local subnet, driven over the public
endpoint with the in-repo ic-canister-client Agent (no dfx needed):
deploys N universal canisters via provisional_create_canister_with_cycles,
then runs throughput / compute / dirty-page / memory-growth phases and
reports throughput, latency and error classes.

Run (release build, matching the module docs in hammer.rs):
  UNIVERSAL_CANISTER_WASM_PATH=/path/to/universal_canister.wasm \
    cargo run -p ic-canister-client --example hammer --release -- http://localhost:8080

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/Cargo.toml         |   1 +
 rs/canister_client/examples/hammer.rs | 299 ++++++++++++++++++++++++++
 2 files changed, 300 insertions(+)
 create mode 100644 rs/canister_client/examples/hammer.rs

diff --git a/rs/canister_client/Cargo.toml b/rs/canister_client/Cargo.toml
index 1dfd761ad595..905df0adade2 100644
--- a/rs/canister_client/Cargo.toml
+++ b/rs/canister_client/Cargo.toml
@@ -31,6 +31,7 @@ url = { workspace = true }
 [dev-dependencies]
 hex = { workspace = true }
 ic-certification-test-utils = { path = "../certification/test-utils" }
+ic-universal-canister = { path = "../universal_canister/lib" }
 ic-crypto-test-utils-reproducible-rng = { path = "../crypto/test_utils/reproducible_rng" }
 ic-crypto-test-utils-root-of-trust = { path = "../crypto/test_utils/root_of_trust" }
 ic-crypto-test-utils-tls = { path = "../crypto/test_utils/tls" }
diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
new file mode 100644
index 000000000000..48c1ccee75a5
--- /dev/null
+++ b/rs/canister_client/examples/hammer.rs
@@ -0,0 +1,299 @@
+//! Stress-test driver for the local 4-node subnet (dev/local-net).
+//!
+//! Deploys N universal canisters via `provisional_create_canister_with_cycles`
+//! and then hammers the subnet with compute and memory load, reporting how it
+//! holds up. Drives the public endpoint with the in-repo `ic-canister-client`
+//! Agent, so it needs no dfx / external SDK.
+//!
+//! Run with (`UNIVERSAL_CANISTER_WASM_PATH` set to the path of universal_canister.wasm):
+//!   cargo run -p ic-canister-client --example hammer --release -- http://localhost:8080
+//!
+//! Env knobs: HAMMER_CANISTERS (default 6), HAMMER_SECS (per throughput/compute
+//! phase, default 15), HAMMER_CONCURRENCY (default 48).
+
+use ic_canister_client::{Agent, Sender};
+use ic_management_canister_types_private::{
+    CanisterIdRecord, CanisterInstallMode, IC_00, InstallCodeArgs, Method, Payload,
+    ProvisionalCreateCanisterWithCyclesArgs,
+};
+use ic_types::{CanisterId, PrincipalId};
+use ic_universal_canister::{get_universal_canister_wasm, wasm};
+use std::collections::BTreeMap;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+use url::Url;
+
+const MIB: u32 = 1024 * 1024;
+// A canister id that lives in this subnet's allocation range (see DEPLOY.md);
+// used only to route `provisional_create_canister_with_cycles`.
+const ROUTING_CANISTER_ID: &str = "bnz7o-iuaaa-aaaaa-qaaaa-cai";
+
+static NONCE: AtomicU64 = AtomicU64::new(1); // process-wide counter backing `next_nonce`
+
+fn next_nonce() -> Vec<u8> {
+    NONCE.fetch_add(1, Ordering::Relaxed).to_le_bytes().to_vec() // pre-increment value, 8 LE bytes
+}
+
+#[derive(Default)]
+struct Stats {
+    ok: AtomicU64, // calls that returned `Ok`
+    err: AtomicU64, // calls that returned `Err`
+    lat_sum_ms: AtomicU64, // summed latency of all calls (ok and err), in ms
+    lat_max_ms: AtomicU64, // slowest single call, in ms
+    err_classes: Mutex<BTreeMap<String, u64>>, // shortened error text -> occurrence count
+}
+
+impl Stats {
+    fn record(&self, started: Instant, result: &Result<Option<Vec<u8>>, String>) {
+        let ms = started.elapsed().as_millis() as u64; // latency since `started`, ok or err
+        self.lat_sum_ms.fetch_add(ms, Ordering::Relaxed);
+        self.lat_max_ms.fetch_max(ms, Ordering::Relaxed);
+        match result {
+            Ok(_) => {
+                self.ok.fetch_add(1, Ordering::Relaxed);
+            }
+            Err(e) => {
+                self.err.fetch_add(1, Ordering::Relaxed);
+                // Class = first 10 words, capped at 120 chars, so the histogram stays readable.
+                let class: String = e.split_whitespace().take(10).collect::<Vec<_>>().join(" ");
+                let class: String = class.chars().take(120).collect();
+                *self.err_classes.lock().unwrap().entry(class).or_insert(0) += 1;
+            }
+        }
+    }
+
+    fn report(&self, label: &str, wall: Duration) {
+        let ok = self.ok.load(Ordering::Relaxed);
+        let err = self.err.load(Ordering::Relaxed);
+        let total = ok + err;
+        let avg = if total > 0 {
+            self.lat_sum_ms.load(Ordering::Relaxed) / total // integer mean over all calls
+        } else {
+            0 // nothing was recorded
+        };
+        let rps = ok as f64 / wall.as_secs_f64().max(0.001); // successes only; never divides by 0
+        println!(
+            "\n── {label} ──\n  ok={ok} err={err}  throughput={rps:.1} ok/s  \
+             latency avg={avg}ms max={}ms  (wall {:.1}s)",
+            self.lat_max_ms.load(Ordering::Relaxed),
+            wall.as_secs_f64()
+        );
+        let classes = self.err_classes.lock().unwrap(); // BTreeMap: printed in key order
+        if !classes.is_empty() {
+            println!("  error classes:");
+            for (c, n) in classes.iter() {
+                println!("    [{n:>5}] {c}");
+            }
+        }
+    }
+}
+
+async fn update(
+    agent: &Agent,
+    canister: &CanisterId, // passed as both the first (routing) and second (target) id
+    payload: Vec<u8>, // e.g. an op sequence built with `wasm()...build()`, as in `deploy_one`
+) -> Result<Option<Vec<u8>>, String> {
+    agent // invokes the method named "update" with `payload` and a fresh nonce
+        .execute_update(canister, canister, "update", payload, next_nonce())
+        .await
+}
+
+/// Creates and installs one universal canister; if `pre_grow_pages > 0`, grows stable memory.
+async fn deploy_one(
+    agent: &Agent,
+    routing_id: &CanisterId, // routes the create call; the callee itself is `IC_00`
+    pre_grow_pages: u32, // passed to `stable_grow`; 0 skips it (caller labels 512 as 32 MiB)
+) -> Result<CanisterId, String> {
+    let args = ProvisionalCreateCanisterWithCyclesArgs::new(
+        Some(1_000_000_000_000_000_u128), // 1 Pcycle, never freezes
+        None,
+    );
+    let reply = agent
+        .execute_update(
+            routing_id,
+            &IC_00,
+            Method::ProvisionalCreateCanisterWithCycles,
+            args.encode(),
+            next_nonce(),
+        )
+        .await?
+        .ok_or("provisional_create: empty reply")?; // a `None` reply counts as failure
+    let canister_id = CanisterIdRecord::decode(&reply)
+        .map_err(|e| format!("decode CanisterIdRecord: {e}"))?
+        .get_canister_id();
+
+    agent
+        .install_canister(InstallCodeArgs::new(
+            CanisterInstallMode::Install,
+            canister_id,
+            get_universal_canister_wasm(), // the Wasm module to install
+            vec![],
+        ))
+        .await?;
+
+    if pre_grow_pages > 0 {
+        update(agent, &canister_id, wasm().stable_grow(pre_grow_pages).reply().build()).await?;
+    }
+    Ok(canister_id)
+}
+
+/// Run `make_payload` against the (non-empty) canister pool from `concurrency` workers,
+/// round-robin, until `dur` elapses (in-flight calls finish); returns the combined stats.
+async fn storm(
+    agent: Arc<Agent>,
+    canisters: Arc<Vec<CanisterId>>,
+    concurrency: usize,
+    dur: Duration,
+    make_payload: Arc<dyn Fn() -> Vec<u8> + Send + Sync>,
+) -> Stats {
+    let stats = Arc::new(Stats::default());
+    let deadline = Instant::now() + dur;
+    let rr = Arc::new(AtomicU64::new(0)); // shared round-robin cursor into `canisters`
+    let mut handles = Vec::new();
+    for _ in 0..concurrency {
+        let agent = agent.clone();
+        let canisters = canisters.clone();
+        let stats = stats.clone();
+        let rr = rr.clone();
+        let make_payload = make_payload.clone();
+        handles.push(tokio::spawn(async move {
+            while Instant::now() < deadline {
+                let idx = rr.fetch_add(1, Ordering::Relaxed) as usize % canisters.len();
+                let canister = canisters[idx];
+                let payload = make_payload();
+                let started = Instant::now();
+                let res = update(&agent, &canister, payload).await;
+                stats.record(started, &res);
+            }
+        }));
+    }
+    for h in handles {
+        h.await.unwrap_or_else(|e| eprintln!("  ! storm worker died: {e}"));
+    }
+    Arc::try_unwrap(stats).ok().expect("all storm workers joined, so `stats` has one owner")
+}
+
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    let url = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "http://localhost:8080".to_string());
+    let num_canisters: usize = std::env::var("HAMMER_CANISTERS")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(6);
+    let secs: u64 = std::env::var("HAMMER_SECS")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(15);
+    let concurrency: usize = std::env::var("HAMMER_CONCURRENCY")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(48);
+
+    let agent = Arc::new(Agent::new(
+        Url::parse(&url).expect("bad url"),
+        Sender::Anonymous,
+    ));
+    let routing_id =
+        CanisterId::unchecked_from_principal(PrincipalId::from_str(ROUTING_CANISTER_ID).unwrap());
+
+    println!("== hammer ==");
+    println!("target={url} canisters={num_canisters} phase_secs={secs} concurrency={concurrency}");
+
+    // ---- Deploy ----
+    println!("\n[1/5] deploying {num_canisters} universal canisters (pre-grow 32 MiB stable each)...");
+    let t0 = Instant::now();
+    let mut canisters = Vec::new();
+    for i in 0..num_canisters {
+        match deploy_one(&agent, &routing_id, 512).await {
+            Ok(id) => {
+                println!("  + canister {i} = {id}");
+                canisters.push(id);
+            }
+            Err(e) => println!("  ! deploy {i} failed: {e}"),
+        }
+    }
+    if canisters.is_empty() {
+        eprintln!("no canisters deployed; aborting");
+        std::process::exit(1);
+    }
+    println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
+    let canisters = Arc::new(canisters);
+
+    // ---- Phase A: ingress/throughput storm (near-empty updates) ----
+    println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");
+    let t = Instant::now();
+    let stats = storm(
+        agent.clone(),
+        canisters.clone(),
+        concurrency,
+        Duration::from_secs(secs),
+        Arc::new(|| wasm().reply().build()),
+    )
+    .await;
+    stats.report("THROUGHPUT (empty updates)", t.elapsed());
+
+    // ---- Phase B: compute storm (8 MiB stable fill per call, within dirty limit) ----
+    println!("\n[3/5] COMPUTE storm: 8 MiB stable_fill per call, {concurrency} concurrent, {secs}s");
+    let t = Instant::now();
+    let stats = storm(
+        agent.clone(),
+        canisters.clone(),
+        concurrency,
+        Duration::from_secs(secs),
+        Arc::new(|| wasm().stable_fill(0, 0x61, 8 * MIB).reply().build()),
+    )
+    .await;
+    stats.report("COMPUTE (8 MiB fill)", t.elapsed());
+
+    // ---- Phase C: per-message dirty-page limit (expect traps) ----
+    println!("\n[4/5] DIRTY-LIMIT probe: 48 MiB dirty in one message (limit is 32 MiB) x16");
+    let probe = Stats::default();
+    for _ in 0..16 {
+        let c = canisters[0];
+        let p = wasm().stable_grow(1024).stable_fill(0, 0x62, 48 * MIB).reply().build();
+        let started = Instant::now();
+        let res = update(&agent, &c, p).await;
+        probe.record(started, &res);
+    }
+    probe.report("DIRTY-LIMIT (48 MiB/msg)", Duration::from_secs(1));
+
+    // ---- Phase D: grow stable memory toward the 512 MiB subnet cap ----
+    println!("\n[5/5] MEMORY-GROWTH storm: grow 16 MiB + fill per call across all canisters until rejected");
+    let grow = Arc::new(Stats::default());
+    let total_mib = Arc::new(AtomicU64::new(0));
+    let mut handles = Vec::new();
+    for &c in canisters.iter() {
+        let agent = agent.clone();
+        let grow = grow.clone();
+        let total_mib = total_mib.clone();
+        handles.push(tokio::spawn(async move {
+            // Hard cap iterations so a misbehaving run can't loop forever.
+            for _ in 0..64 {
+                let p = wasm().stable_grow(256).stable_fill(0, 0x63, 16 * MIB).reply().build();
+                let started = Instant::now();
+                let res = update(&agent, &c, p).await;
+                let ok = res.is_ok();
+                grow.record(started, &res);
+                if ok {
+                    total_mib.fetch_add(16, Ordering::Relaxed);
+                } else {
+                    break; // first rejection for this canister: stop growing it
+                }
+            }
+        }));
+    }
+    for h in handles {
+        let _ = h.await;
+    }
+    grow.report("MEMORY-GROWTH", Duration::from_secs(1));
+    println!(
+        "  approx stable memory successfully grown across subnet: ~{} MiB",
+        total_mib.load(Ordering::Relaxed)
+    );
+
+    println!("\n== done ==");
+}

From 827506f2dc5719386c14f875e6f8b4a1d31d9ec5 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 10:23:45 +0200
Subject: [PATCH 04/14] feat(config): nano profile lets canisters allocate up
 to the subnet cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Disable the storage cycle-reservation mechanism on the nano profile so
canisters can freely allocate up to the subnet memory capacity:

- SUBNET_MEMORY_THRESHOLD = SUBNET_MEMORY_CAPACITY (512 MiB). When the
  threshold is >= capacity the subnet is never "high usage", so growth
  never triggers cycle reservations (whose mainnet-calibrated pricing
  otherwise rejects growth on a tiny subnet, hitting the reserved-cycles
  limit).
- SUBNET_MEMORY_RESERVATION = 8 MiB/thread (was 64), so the response-
  callback reservation no longer caps usable storage well below capacity.

Also bake the matching hypervisor override into dev/local-net/prep.sh so
the local 4-node net inherits it across resets.

Verified on the local-net: with reservation disabled, a single message
writing 24 MiB of stable memory succeeds while 48 MiB traps with
"Exceeded the limit for the number of accessed pages ... limit 32768 KB"
(the nano 32 MiB per-message stable limit), and the subnet keeps
finalizing with no replica panic — i.e. the per-message limit, not an
OOM-kill, bounds a single execution's working set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 dev/local-net/prep.sh                  |  7 +++++++
 rs/config/src/execution_environment.rs | 16 +++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/dev/local-net/prep.sh b/dev/local-net/prep.sh
index 003f8af6e7b4..945127cdf01a 100755
--- a/dev/local-net/prep.sh
+++ b/dev/local-net/prep.sh
@@ -139,6 +139,13 @@ for i in 0 1 2 3; do
   crypto: {
     crypto_root: "/etc/ic/crypto",
   },
+  hypervisor: {
+    // Nano profile: threshold == subnet memory capacity (512 MiB) disables the
+    // storage cycle-reservation mechanism; the small reservation (16 MiB, i.e.
+    // 8 MiB x 2 execution threads) lets canisters allocate nearly all of it.
+    subnet_memory_threshold: 536870912,
+    subnet_memory_reservation: 16777216,
+  },
   http_handler: {
     listen_addr: "[::]:$HTTP_PORT",
   },
diff --git a/rs/config/src/execution_environment.rs b/rs/config/src/execution_environment.rs
index 0daab53b627e..4a7ce32c2f6a 100644
--- a/rs/config/src/execution_environment.rs
+++ b/rs/config/src/execution_environment.rs
@@ -33,8 +33,12 @@ pub const TEST_DEFAULT_LOG_MEMORY_USAGE: u64 = if LOG_MEMORY_STORE_FEATURE_ENABL
 /// This specifies the threshold in bytes at which the subnet memory usage is
 /// considered to be high. If this value is greater or equal to the subnet
 /// capacity, then the subnet is never considered to have high usage.
-// Nano-replica profile: scaled down to run on a 512 MiB - 1 GiB VM.
-const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(384 * MIB);
+// Nano-replica profile: set equal to the subnet memory capacity so the subnet
+// is never considered "high usage" and the storage cycle-reservation mechanism
+// stays disabled — canisters can allocate freely up to the subnet capacity
+// without reserving cycles (reservation pricing is calibrated for mainnet and
+// would otherwise reject growth on a tiny subnet).
+const SUBNET_MEMORY_THRESHOLD: NumBytes = NumBytes::new(512 * MIB);
 
 /// This is the upper limit on how much logical storage canisters can request to
 /// be store on a given subnet.
@@ -94,10 +98,12 @@ const SUBNET_WASM_CUSTOM_SECTIONS_MEMORY_CAPACITY: NumBytes = NumBytes::new(16 *
 pub(crate) const NUMBER_OF_EXECUTION_THREADS: usize = 2;
 
 /// The number of bytes reserved for response callback executions.
-/// Nano-replica profile: 64MiB per thread (must stay well below the subnet
-/// memory capacity, otherwise no memory is left for canister state).
+/// Nano-replica profile: keep this small (8 MiB per thread) so canisters can
+/// allocate almost the entire subnet memory capacity. The reservation only
+/// guards response-callback execution headroom; on a best-effort-leaning nano
+/// subnet a small reservation is sufficient.
 pub const SUBNET_MEMORY_RESERVATION: NumBytes =
-    NumBytes::new(64 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
+    NumBytes::new(8 * MIB * NUMBER_OF_EXECUTION_THREADS as u64);
 
 /// The soft limit on the subnet-wide number of callbacks.
 pub const SUBNET_CALLBACK_SOFT_LIMIT: usize = 4_096;

From 7815c58d870a38a8ad5b29352653e05546cfb568 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 10:23:45 +0200
Subject: [PATCH 05/14] test(canister_client): hammer probe mode + full error
 messages

- HAMMER_MODE=probe runs only the per-message dirty/accessed-page-limit
  probe (skips the throughput/compute/growth storms).
- Grow stable memory in its own committed message, then fill 24 MiB
  (under the 32 MiB limit, expect OK) and 48 MiB (over, expect trap), so
  the limit is isolated from subnet-capacity effects.
- Widen error-class output so full canister reject reasons are visible.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                            |  1 +
 rs/canister_client/examples/hammer.rs | 42 ++++++++++++++++++---------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 16bcb0f104fd..b26b2d52cb28 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7258,6 +7258,7 @@ dependencies = [
  "ic-secp256k1",
  "ic-test-utilities-types",
  "ic-types",
+ "ic-universal-canister",
  "ic-validator",
  "itertools 0.12.1",
  "prost 0.13.5",
diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 48c1ccee75a5..77c96edc1472 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -57,8 +57,8 @@ impl Stats {
             Err(e) => {
                 self.err.fetch_add(1, Ordering::Relaxed);
                 // Collapse to a short class so the histogram stays readable.
-                let class: String = e.split_whitespace().take(10).collect::<Vec<_>>().join(" ");
-                let class: String = class.chars().take(120).collect();
+                let class: String = e.split_whitespace().take(60).collect::<Vec<_>>().join(" ");
+                let class: String = class.chars().take(400).collect();
                 *self.err_classes.lock().unwrap().entry(class).or_insert(0) += 1;
             }
         }
@@ -192,6 +192,9 @@ async fn main() {
         .ok()
         .and_then(|s| s.parse().ok())
         .unwrap_or(48);
+    // probe mode: skip the throughput/compute/growth storms, run only the
+    // per-message dirty-page-limit probe (Phase C).
+    let probe_only = std::env::var("HAMMER_MODE").map(|m| m == "probe").unwrap_or(false);
 
     let agent = Arc::new(Agent::new(
         Url::parse(&url).expect("bad url"),
@@ -223,6 +226,7 @@ async fn main() {
     println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
     let canisters = Arc::new(canisters);
 
+    if !probe_only {
     // ---- Phase A: ingress/throughput storm (near-empty updates) ----
     println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");
     let t = Instant::now();
@@ -248,19 +252,30 @@ async fn main() {
     )
     .await;
     stats.report("COMPUTE (8 MiB fill)", t.elapsed());
-
-    // ---- Phase C: per-message dirty-page limit (expect traps) ----
-    println!("\n[4/5] DIRTY-LIMIT probe: 48 MiB dirty in one message (limit is 32 MiB) x16");
-    let probe = Stats::default();
-    for _ in 0..16 {
-        let c = canisters[0];
-        let p = wasm().stable_grow(1024).stable_fill(0, 0x62, 48 * MIB).reply().build();
-        let started = Instant::now();
-        let res = update(&agent, &c, p).await;
-        probe.record(started, &res);
     }
-    probe.report("DIRTY-LIMIT (48 MiB/msg)", Duration::from_secs(1));
 
+    // ---- Phase C: per-message dirty-page limit (32 MiB) ----
+    // Grow in a separate (committed) message first, then fill in-bounds amounts
+    // so we isolate the *dirty-page* limit from any grow/bounds effects.
+    println!("\n[4/5] DIRTY-LIMIT probe (per-message stable dirty limit = 32 MiB)");
+    let c = canisters[0];
+    let g = update(&agent, &c, wasm().stable_grow(1024).reply().build()).await; // +64 MiB, commit
+    println!(
+        "  grow +64 MiB (own message): {}",
+        if g.is_ok() { "OK".to_string() } else { format!("ERR {}", g.as_ref().err().unwrap().chars().take(200).collect::<String>()) }
+    );
+    let small = update(&agent, &c, wasm().stable_fill(0, 0x62, 24 * MIB).reply().build()).await;
+    println!(
+        "  fill 24 MiB (UNDER 32 MiB limit): {}",
+        if small.is_ok() { "OK".to_string() } else { format!("ERR {}", small.as_ref().err().unwrap().chars().take(260).collect::<String>()) }
+    );
+    let big = update(&agent, &c, wasm().stable_fill(0, 0x62, 48 * MIB).reply().build()).await;
+    println!(
+        "  fill 48 MiB (OVER 32 MiB limit):  {}",
+        if big.is_ok() { "OK — NO LIMIT ENFORCED".to_string() } else { format!("TRAP {}", big.as_ref().err().unwrap().chars().take(320).collect::<String>()) }
+    );
+
+    if !probe_only {
     // ---- Phase D: grow stable memory toward the 512 MiB subnet cap ----
     println!("\n[5/5] MEMORY-GROWTH storm: grow 16 MiB + fill per call across all canisters until rejected");
     let grow = Arc::new(Stats::default());
@@ -294,6 +309,7 @@ async fn main() {
         "  approx stable memory successfully grown across subnet: ~{} MiB",
         total_mib.load(Ordering::Relaxed)
     );
+    }
 
     println!("\n== done ==");
 }

From aded64b6a8c0619af58fe8a057afc253cb5a7e9d Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 10:52:06 +0200
Subject: [PATCH 06/14] feat(local-net): shorten DKG/checkpoint interval to 50
 rounds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nano heap-delta capacity (96 MiB) is small relative to the default
~500-round checkpoint interval, so a memory-write-heavy workload fills the
heap delta in a few rounds and then execution stalls until the next
checkpoint flushes it (consensus keeps finalizing throughout — graceful,
but execution duty-cycle collapses).

Pass --dkg-interval-length 49 to ic-prep so checkpoints happen every ~50
rounds. Measured effect under the same hammer workload:
  heap-delta round-skips during the run: ~880 -> ~150
  compute phase drains ~3x faster; execution advances in short bursts
  instead of multi-minute stalls.

Checkpoint cadence follows the DKG interval (CUP heights); the more frequent
checkpoints are cheap here because the nano subnet state is only a few
hundred MB.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 dev/local-net/prep.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/local-net/prep.sh b/dev/local-net/prep.sh
index 945127cdf01a..6aa3f71dd785 100755
--- a/dev/local-net/prep.sh
+++ b/dev/local-net/prep.sh
@@ -97,6 +97,7 @@ docker run --rm \
     --nns-subnet-index "$SUBNET_IDX" \
     --provisional-whitelist /bootstrap/.provisional_whitelist.json \
     --use-specified-ids-allocation-range \
+    --dkg-interval-length 49 \
     "${NODE_ARGS[@]}"
 
 # Permissions: ic-prep wrote as root inside the container. Make the

From 60b94648c2fde6d831ee414127da7a15bc40a5ed Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 11:06:05 +0200
Subject: [PATCH 07/14] test(canister_client): add read-heavy mode to hammer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HAMMER_MODE=read populates N canisters with large stable state, then runs
read-heavy 24 MiB stable_read calls — updates on all-but-one canister and
queries on the last — concurrently, plus a 48 MiB single-execution read
probe to exercise the per-message/query stable accessed-page limit.
storm() gains an is_query flag to drive query calls via execute_query.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 76 ++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 77c96edc1472..a8c490c3d4e7 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -147,6 +147,7 @@ async fn storm(
     concurrency: usize,
     dur: Duration,
     make_payload: Arc<dyn Fn() -> Vec<u8> + Send + Sync>,
+    is_query: bool,
 ) -> Stats {
     let stats = Arc::new(Stats::default());
     let deadline = Instant::now() + dur;
@@ -164,7 +165,11 @@ async fn storm(
                 let canister = canisters[idx];
                 let payload = make_payload();
                 let started = Instant::now();
-                let res = update(&agent, &canister, payload).await;
+                let res = if is_query {
+                    agent.execute_query(&canister, "query", payload).await
+                } else {
+                    update(&agent, &canister, payload).await
+                };
                 stats.record(started, &res);
             }
         }));
@@ -195,6 +200,9 @@ async fn main() {
     // probe mode: skip the throughput/compute/growth storms, run only the
     // per-message dirty-page-limit probe (Phase C).
     let probe_only = std::env::var("HAMMER_MODE").map(|m| m == "probe").unwrap_or(false);
+    // read mode: populate canisters with large state, then read-heavy updates on
+    // all-but-one and read-heavy queries on the last; plus a read-limit probe.
+    let read_mode = std::env::var("HAMMER_MODE").map(|m| m == "read").unwrap_or(false);
 
     let agent = Arc::new(Agent::new(
         Url::parse(&url).expect("bad url"),
@@ -226,6 +234,70 @@ async fn main() {
     println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
     let canisters = Arc::new(canisters);
 
+    if read_mode {
+        // Populate each canister with ~120 MiB of real stable data (written in
+        // <=24 MiB chunks to respect the 32 MiB per-message dirty limit).
+        const BIG_MIB: u32 = 120;
+        let chunk: u32 = 24 * MIB;
+        let pages: u32 = (BIG_MIB * MIB) / 65536; // 64 KiB Wasm pages
+        println!("\n[read] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let _ = update(&agent, c, wasm().stable_grow(pages).reply().build()).await;
+            let mut off = 0u32;
+            while off + chunk <= BIG_MIB * MIB {
+                let _ = update(&agent, c, wasm().stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await;
+                off += chunk;
+            }
+            println!("  canister {i} = {c} populated");
+        }
+        println!("[read] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read 24 MiB (< 32 MiB accessed limit) per call, cycling the offset
+        // window across the populated range.
+        let off_ctr = Arc::new(AtomicU64::new(0));
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let off_ctr = off_ctr.clone();
+            Arc::new(move || {
+                let n = off_ctr.fetch_add(1, Ordering::Relaxed);
+                let off = ((n % 4) as u32) * chunk;
+                wasm().stable_read(off, chunk).reply().build()
+            })
+        };
+        let upd_cans = Arc::new(canisters[..canisters.len() - 1].to_vec());
+        let qry_cans = Arc::new(vec![canisters[canisters.len() - 1]]);
+        println!(
+            "\n[read] read storm ({secs}s): 24 MiB stable_read/call — UPDATES on {} canisters, QUERIES on 1",
+            upd_cans.len()
+        );
+        let t = Instant::now();
+        let (us, qs) = tokio::join!(
+            storm(agent.clone(), upd_cans, concurrency, Duration::from_secs(secs), mk.clone(), false),
+            storm(agent.clone(), qry_cans.clone(), concurrency, Duration::from_secs(secs), mk.clone(), true),
+        );
+        us.report("READ-UPDATE (24 MiB stable_read)", t.elapsed());
+        qs.report("READ-QUERY (24 MiB stable_read)", t.elapsed());
+
+        // Read-limit probe: access 48 MiB in one execution (> 32 MiB accessed
+        // limit) -> expect a trap, for both update and query.
+        println!("\n[read] read-limit probe: 48 MiB stable_read in one execution (accessed limit 32 MiB)");
+        let ru = update(&agent, &canisters[0], wasm().stable_read(0, 48 * MIB).reply().build()).await;
+        println!(
+            "  update read 48 MiB: {}",
+            if ru.is_ok() { "OK (no limit!)".to_string() } else { format!("TRAP {}", ru.as_ref().err().unwrap().chars().take(220).collect::<String>()) }
+        );
+        let rq = agent
+            .execute_query(&canisters[canisters.len() - 1], "query", wasm().stable_read(0, 48 * MIB).reply().build())
+            .await;
+        println!(
+            "  query  read 48 MiB: {}",
+            if rq.is_ok() { "OK (no limit!)".to_string() } else { format!("TRAP {}", rq.as_ref().err().unwrap().chars().take(220).collect::<String>()) }
+        );
+
+        println!("\n== done ==");
+        return;
+    }
+
     if !probe_only {
     // ---- Phase A: ingress/throughput storm (near-empty updates) ----
     println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");
@@ -236,6 +308,7 @@ async fn main() {
         concurrency,
         Duration::from_secs(secs),
         Arc::new(|| wasm().reply().build()),
+        false,
     )
     .await;
     stats.report("THROUGHPUT (empty updates)", t.elapsed());
@@ -249,6 +322,7 @@ async fn main() {
         concurrency,
         Duration::from_secs(secs),
         Arc::new(|| wasm().stable_fill(0, 0x61, 8 * MIB).reply().build()),
+        false,
     )
     .await;
     stats.report("COMPUTE (8 MiB fill)", t.elapsed());

From 6a390b73e2983a7a7b62234ccbda9c1ea878016c Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 11:35:26 +0200
Subject: [PATCH 08/14] test(canister_client): add heap-memory mode to hammer

HAMMER_MODE=heap mirrors the stable-memory tests on Wasm heap memory:
per-message heap-write probe (24/48/96 MiB in one message), heap-write
storm (8 MiB/call), and a heap-read storm (40 MiB get_global_data reads,
updates + queries). Demonstrates that heap has no per-execution
dirty/accessed cap (the 32 MiB limits are stable-only): all three
single-message heap writes and the 40 MiB heap reads succeed, whereas the
stable equivalents trap at 32 MiB.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 69 +++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index a8c490c3d4e7..aedc25d4ff17 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -203,6 +203,11 @@ async fn main() {
     // read mode: populate canisters with large state, then read-heavy updates on
     // all-but-one and read-heavy queries on the last; plus a read-limit probe.
     let read_mode = std::env::var("HAMMER_MODE").map(|m| m == "read").unwrap_or(false);
+    // heap mode: the heap-memory (Wasm) analogue of the stable-memory tests
+    // (compute/dirty-limit/read). Heap has no per-execution dirty/accessed cap
+    // (the 32 MiB limits are stable-only), so a single message can touch
+    // arbitrarily large heap.
+    let heap_mode = std::env::var("HAMMER_MODE").map(|m| m == "heap").unwrap_or(false);
 
     let agent = Arc::new(Agent::new(
         Url::parse(&url).expect("bad url"),
@@ -298,6 +303,70 @@ async fn main() {
         return;
     }
 
+    if heap_mode {
+        // ---- Heap per-message write probe ----
+        // Stable memory traps a single message that dirties/accesses > 32 MiB;
+        // heap (Wasm) memory has no such per-message cap. push_equal_bytes(b, n)
+        // pushes n bytes onto the data stack, dirtying n bytes of heap.
+        println!("\n[heap] per-message heap-write probe (stable's per-msg limit is 32 MiB; heap has none)");
+        for sz in [24u32, 48, 96] {
+            let r = update(&agent, &canisters[0], wasm().push_equal_bytes(0x61, sz * MIB).reply().build()).await;
+            println!(
+                "  push {sz} MiB onto heap in ONE message: {}",
+                if r.is_ok() { "OK".to_string() } else { format!("TRAP {}", r.as_ref().err().unwrap().chars().take(200).collect::<String>()) }
+            );
+        }
+
+        // ---- Heap-write storm (analogue of the COMPUTE storm) ----
+        let upd_cans = Arc::new(canisters[..canisters.len() - 1].to_vec());
+        println!("\n[heap] heap-write storm ({secs}s): 8 MiB heap write/call on {} canisters", upd_cans.len());
+        let t = Instant::now();
+        let ws = storm(
+            agent.clone(),
+            upd_cans.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            Arc::new(|| wasm().push_equal_bytes(0x61, 8 * MIB).reply().build()),
+            false,
+        )
+        .await;
+        ws.report("HEAP-WRITE (8 MiB/call)", t.elapsed());
+
+        // ---- Populate a persistent heap global, then read it ----
+        const BIG_MIB: u32 = 40; // > 32 MiB so reads exceed the stable per-msg limit
+        println!("\n[heap] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let r = update(
+                &agent,
+                c,
+                wasm().push_equal_bytes(0x41 + i as u32, BIG_MIB * MIB).set_global_data_from_stack().reply().build(),
+            )
+            .await;
+            println!("  canister {i} = {c}: {}", if r.is_ok() { "populated".to_string() } else { format!("ERR {}", r.as_ref().err().unwrap().chars().take(160).collect::<String>()) });
+        }
+        println!("[heap] waiting ~25s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // ---- Heap-read storm (analogue of the stable READ test) ----
+        // get_global_data reads the whole 40 MiB global in one execution — more
+        // than the 32 MiB stable per-message accessed limit would ever allow.
+        let qry_cans = Arc::new(vec![canisters[canisters.len() - 1]]);
+        println!(
+            "\n[heap] heap-read storm ({secs}s): read {BIG_MIB} MiB heap global/call — UPDATES on {} canisters, QUERIES on 1",
+            upd_cans.len()
+        );
+        let t = Instant::now();
+        let (us, qs) = tokio::join!(
+            storm(agent.clone(), upd_cans.clone(), concurrency, Duration::from_secs(secs), Arc::new(|| wasm().get_global_data().reply().build()), false),
+            storm(agent.clone(), qry_cans.clone(), concurrency, Duration::from_secs(secs), Arc::new(|| wasm().get_global_data().reply().build()), true),
+        );
+        us.report("HEAP-READ-UPDATE (40 MiB heap read)", t.elapsed());
+        qs.report("HEAP-READ-QUERY (40 MiB heap read)", t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
     if !probe_only {
     // ---- Phase A: ingress/throughput storm (near-empty updates) ----
     println!("\n[2/5] THROUGHPUT storm: empty update calls, {concurrency} concurrent, {secs}s");

From b9537be397144edb44d34e7f26f9304ec52db687 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 13:00:35 +0200
Subject: [PATCH 09/14] test(canister_client): heapread mode + full-range read
 cycling

- read mode: cycle read offsets across the FULL populated range (not 4
  fixed windows) and error-check the populate, so reads pull distinct
  state and all canisters are actually large.
- heapread mode: build a large per-canister heap global via
  append_to_global_data and query-read it (96 MiB/read). This surfaces
  two findings: (1) large heap state is ~2.5x more expensive than stable
  state (the wasm heap never shrinks + realloc on build), so 3x96 MiB
  heap globals OOM the 512 MiB subnet while 3x128 MiB stable fits; and
  (2) large heap reads via update calls OOM (the get_global_data copy
  grows the heap).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 69 ++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index aedc25d4ff17..326600842af1 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -208,6 +208,11 @@ async fn main() {
     // (the 32 MiB limits are stable-only), so a single message can touch
     // arbitrarily large heap.
     let heap_mode = std::env::var("HAMMER_MODE").map(|m| m == "heap").unwrap_or(false);
+    // heapread mode: large heap-memory reads pulling lots of distinct state into
+    // RAM. Each canister holds a 96 MiB heap global (built via append, small
+    // transient); reads use queries (heap reads via update would OOM because
+    // get_global_data copies the global to the stack, permanently growing heap).
+    let heapread_mode = std::env::var("HAMMER_MODE").map(|m| m == "heapread").unwrap_or(false);
 
     let agent = Arc::new(Agent::new(
         Url::parse(&url).expect("bad url"),
@@ -242,18 +247,22 @@ async fn main() {
     if read_mode {
         // Populate each canister with ~120 MiB of real stable data (written in
         // <=24 MiB chunks to respect the 32 MiB per-message dirty limit).
-        const BIG_MIB: u32 = 120;
+        const BIG_MIB: u32 = 128;
         let chunk: u32 = 24 * MIB;
         let pages: u32 = (BIG_MIB * MIB) / 65536; // 64 KiB Wasm pages
         println!("\n[read] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
         for (i, c) in canisters.iter().enumerate() {
-            let _ = update(&agent, c, wasm().stable_grow(pages).reply().build()).await;
-            let mut off = 0u32;
+            if let Err(e) = update(&agent, c, wasm().stable_grow(pages).reply().build()).await {
+                println!("  canister {i}: GROW FAILED: {}", e.chars().take(160).collect::<String>());
+            }
+            let (mut off, mut werr) = (0u32, 0u32);
             while off + chunk <= BIG_MIB * MIB {
-                let _ = update(&agent, c, wasm().stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await;
+                if update(&agent, c, wasm().stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await.is_err() {
+                    werr += 1;
+                }
                 off += chunk;
             }
-            println!("  canister {i} = {c} populated");
+            println!("  canister {i} = {c} populated ({werr} write errors)");
         }
         println!("[read] waiting ~25s for a checkpoint to flush state to disk...");
         tokio::time::sleep(Duration::from_secs(25)).await;
@@ -263,9 +272,10 @@ async fn main() {
         let off_ctr = Arc::new(AtomicU64::new(0));
         let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
             let off_ctr = off_ctr.clone();
+            let windows = ((BIG_MIB * MIB) / chunk) as u64; // cycle across the FULL state
             Arc::new(move || {
                 let n = off_ctr.fetch_add(1, Ordering::Relaxed);
-                let off = ((n % 4) as u32) * chunk;
+                let off = ((n % windows) as u32) * chunk;
                 wasm().stable_read(off, chunk).reply().build()
             })
         };
@@ -303,6 +313,48 @@ async fn main() {
         return;
     }
 
+    if heapread_mode {
+        // Build a large heap global per canister via append (24 MiB chunks, so
+        // the transient heap stays small and all 3 globals fit under the cap).
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let appends = (BIG_MIB * MIB) / chunk;
+        println!("\n[heapread] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut ok = true;
+            for _ in 0..appends {
+                if update(&agent, c, wasm().push_equal_bytes(0x41 + i as u32, chunk).append_to_global_data().reply().build()).await.is_err() {
+                    ok = false;
+                }
+            }
+            println!("  canister {i} = {c}: {}", if ok { "populated" } else { "PARTIAL/FAILED" });
+        }
+        println!("[heapread] waiting ~25s for a checkpoint to flush state to disk...");
+        tokio::time::sleep(Duration::from_secs(25)).await;
+
+        // Read the full 96 MiB global per call via queries on ALL canisters.
+        // (Heap reads via update OOM: the get_global_data stack copy permanently
+        // grows the heap. Queries discard it.) This pulls ~3x96 MiB of distinct
+        // heap state into the page cache.
+        println!(
+            "\n[heapread] heap-read QUERY storm ({secs}s): get_global_data ({BIG_MIB} MiB) on all {} canisters, {concurrency} concurrent",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let qs = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            Arc::new(|| wasm().get_global_data().reply().build()),
+            true,
+        )
+        .await;
+        qs.report("HEAP-READ-QUERY (96 MiB/read)", t.elapsed());
+        println!("\n== done ==");
+        return;
+    }
+
     if heap_mode {
         // ---- Heap per-message write probe ----
         // Stable memory traps a single message that dirties/accesses > 32 MiB;
@@ -333,7 +385,10 @@ async fn main() {
         ws.report("HEAP-WRITE (8 MiB/call)", t.elapsed());
 
         // ---- Populate a persistent heap global, then read it ----
-        const BIG_MIB: u32 = 40; // > 32 MiB so reads exceed the stable per-msg limit
+        // 96 MiB so each get_global_data read pulls ~96 MiB of distinct state
+        // into memory (no per-execution accessed cap on heap, unlike stable's
+        // 32 MiB). 3 canisters x 96 MiB = ~288 MiB distinct read working set.
+        const BIG_MIB: u32 = 96;
         println!("\n[heap] populating {} canisters with a {BIG_MIB} MiB heap global...", canisters.len());
         for (i, c) in canisters.iter().enumerate() {
             let r = update(

From 181946b49ee7a22f97f9682268790a11574e3526 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 14:44:17 +0200
Subject: [PATCH 10/14] test(canister_client): read mode = all-query +
 incremental populate

- read storm now queries ALL canisters cycling the full populated range
  (clean read pressure; queries don't replicate or dirty).
- populate grows+fills in 24 MiB increments (a single 128 MiB grow can be
  rejected; small incremental grows reliably build the state).

Used to measure read memory/perf under a container RAM cap.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 326600842af1..0f492035d429 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -251,13 +251,15 @@ async fn main() {
         let chunk: u32 = 24 * MIB;
         let pages: u32 = (BIG_MIB * MIB) / 65536; // 64 KiB Wasm pages
         println!("\n[read] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        let _ = pages; // grow incrementally below instead of one big grow
+        let grow_pages = chunk / 65536; // pages per 24 MiB step
         for (i, c) in canisters.iter().enumerate() {
-            if let Err(e) = update(&agent, c, wasm().stable_grow(pages).reply().build()).await {
-                println!("  canister {i}: GROW FAILED: {}", e.chars().take(160).collect::<String>());
-            }
             let (mut off, mut werr) = (0u32, 0u32);
+            // Grow + fill one 24 MiB window at a time: a single 128 MiB grow can
+            // be rejected, but small incremental grows reliably build the state.
             while off + chunk <= BIG_MIB * MIB {
-                if update(&agent, c, wasm().stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await.is_err() {
+                let p = wasm().stable_grow(grow_pages).stable_fill(off, 0x40 + i as u32, chunk).reply().build();
+                if update(&agent, c, p).await.is_err() {
                     werr += 1;
                 }
                 off += chunk;
@@ -279,19 +281,14 @@ async fn main() {
                 wasm().stable_read(off, chunk).reply().build()
             })
         };
-        let upd_cans = Arc::new(canisters[..canisters.len() - 1].to_vec());
-        let qry_cans = Arc::new(vec![canisters[canisters.len() - 1]]);
+        let all_cans = Arc::new(canisters.as_ref().clone());
         println!(
-            "\n[read] read storm ({secs}s): 24 MiB stable_read/call — UPDATES on {} canisters, QUERIES on 1",
-            upd_cans.len()
+            "\n[read] read storm ({secs}s): 24 MiB stable_read/call — QUERIES across all {} canisters (cycling full range)",
+            all_cans.len()
         );
         let t = Instant::now();
-        let (us, qs) = tokio::join!(
-            storm(agent.clone(), upd_cans, concurrency, Duration::from_secs(secs), mk.clone(), false),
-            storm(agent.clone(), qry_cans.clone(), concurrency, Duration::from_secs(secs), mk.clone(), true),
-        );
-        us.report("READ-UPDATE (24 MiB stable_read)", t.elapsed());
-        qs.report("READ-QUERY (24 MiB stable_read)", t.elapsed());
+        let qs = storm(agent.clone(), all_cans, concurrency, Duration::from_secs(secs), mk.clone(), true).await;
+        qs.report("READ-QUERY (24 MiB stable_read, all canisters)", t.elapsed());
 
         // Read-limit probe: access 48 MiB in one execution (> 32 MiB accessed
         // limit) -> expect a trap, for both update and query.

From 8f964f9782bc384d83c246756366fea64de45524 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 15:02:42 +0200
Subject: [PATCH 11/14] test(canister_client): add inter-canister call-thrash
 mode

HAMMER_MODE=calls: each ingress makes the target canister start a
HAMMER_CALL_DEPTH-hop chain of update calls around the canister ring
(nested via call_args().other_side), generating ~2*depth inter-canister
messages per ingress. Used to stress message routing, callbacks and the
guaranteed-response memory reservation under the nano profile.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 62 ++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 0f492035d429..5b9cb956e8f9 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -17,7 +17,7 @@ use ic_management_canister_types_private::{
     ProvisionalCreateCanisterWithCyclesArgs,
 };
 use ic_types::{CanisterId, PrincipalId};
-use ic_universal_canister::{get_universal_canister_wasm, wasm};
+use ic_universal_canister::{call_args, get_universal_canister_wasm, wasm};
 use std::collections::BTreeMap;
 use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};
@@ -100,6 +100,24 @@ async fn update(
         .await
 }
 
+/// Build a payload that, executed by the ingress-target canister, makes it call
+/// canisters[start+1], which calls canisters[start+2], ... `depth` hops deep
+/// around the canister ring; the innermost canister replies and the replies
+/// propagate back. Generates ~2*depth inter-canister messages per ingress
+/// (depth requests + depth responses), with `depth` outstanding callbacks at
+/// peak (each holding a guaranteed-response memory reservation).
+fn chain_payload(canisters: &[CanisterId], start: usize, depth: usize) -> Vec<u8> {
+    let k = canisters.len();
+    let mut inner = wasm().reply().build(); // innermost callee just replies
+    for h in (1..=depth).rev() {
+        let callee = canisters[(start + h) % k].get().as_slice().to_vec();
+        inner = wasm()
+            .call_simple(callee, "update", call_args().other_side(inner))
+            .build();
+    }
+    inner
+}
+
 /// Create + install a universal canister; optionally pre-grow its stable memory.
 async fn deploy_one(
     agent: &Agent,
@@ -208,6 +226,13 @@ async fn main() {
     // (the 32 MiB limits are stable-only), so a single message can touch
     // arbitrarily large heap.
     let heap_mode = std::env::var("HAMMER_MODE").map(|m| m == "heap").unwrap_or(false);
+    // calls mode: thrash inter-canister communication — each ingress triggers a
+    // chain of canister-to-canister update calls `HAMMER_CALL_DEPTH` hops deep.
+    let calls_mode = std::env::var("HAMMER_MODE").map(|m| m == "calls").unwrap_or(false);
+    let call_depth: usize = std::env::var("HAMMER_CALL_DEPTH")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(4);
     // heapread mode: large heap-memory reads pulling lots of distinct state into
     // RAM. Each canister holds a 96 MiB heap global (built via append, small
     // transient); reads use queries (heap reads via update would OOM because
@@ -244,6 +269,41 @@ async fn main() {
     println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
     let canisters = Arc::new(canisters);
 
+    if calls_mode {
+        // ---- Inter-canister call thrash ----
+        // Each ingress makes the target canister start a `call_depth`-hop chain
+        // of update calls around the canister ring. With C concurrent ingresses
+        // there are up to C*call_depth outstanding inter-canister calls at peak.
+        let cans = canisters.clone();
+        let ctr = Arc::new(AtomicU64::new(0));
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let cans = cans.clone();
+            Arc::new(move || {
+                let n = ctr.fetch_add(1, Ordering::Relaxed) as usize;
+                chain_payload(&cans, n % cans.len(), call_depth)
+            })
+        };
+        println!(
+            "\n[calls] inter-canister call storm ({secs}s): {call_depth}-hop chains, {concurrency} concurrent ingresses across {} canisters",
+            canisters.len()
+        );
+        println!("  (~{} inter-canister messages per ingress; up to {} outstanding calls at peak)", 2 * call_depth, concurrency * call_depth);
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            mk,
+            false,
+        )
+        .await;
+        stats.report(&format!("INTER-CANISTER CALLS ({call_depth}-hop chains)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
     if read_mode {
         // Populate each canister with ~120 MiB of real stable data (written in
         // <=24 MiB chunks to respect the 32 MiB per-message dirty limit).

From cb022404035c458a571bc900f08aba525d959a3f Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 15:18:04 +0200
Subject: [PATCH 12/14] test(canister_client): inter-canister fan-out mode (+
 single-msg multiplier)

HAMMER_MODE=fanout: each ingress fires N parallel fire-and-forget update
calls (no-op callbacks), leaving N outstanding inter-canister calls per
in-flight ingress to stress the guaranteed-response memory reservation and
callback limits. HAMMER_FANOUT_MULT repeats the fan-out so a single message
issues N*mult calls (all reservations taken before any drain), which
exposes the 64 MiB guaranteed-response cap (~32 simultaneous calls).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 64 +++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 5b9cb956e8f9..1cc9b38ba99c 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -118,6 +118,32 @@ fn chain_payload(canisters: &[CanisterId], start: usize, depth: usize) -> Vec<u8
     inner
 }
 
+/// Build a payload that makes the ingress-target canister fire `mult` parallel
+/// update calls to every canister (fan-out, fire-and-forget: on_reply/on_reject
+/// are no-ops), then reply to the ingress immediately. Each in-flight ingress
+/// thus leaves N*mult (N = canisters.len()) outstanding inter-canister calls, each
+/// holding a guaranteed-response memory reservation (~2 MiB) and a callback
+/// slot — so C concurrent ingresses drive up to C*N*mult simultaneous outstanding
+/// calls, stressing the 64 MiB guaranteed-response cap and the callback limits.
+fn fanout_payload(canisters: &[CanisterId], mult: usize) -> Vec<u8> {
+    let callee_runs = wasm().reply().build(); // callee just replies
+    let noop = wasm().noop().build(); // fire-and-forget: ignore the response
+    let mut p = wasm();
+    // Fire `mult` calls to each canister in ONE message: all the response-memory
+    // reservations are taken before any callee runs, so a single message issuing
+    // > ~32 calls exceeds the 64 MiB guaranteed-response cap.
+    for _ in 0..mult {
+        for c in canisters {
+            let callee = c.get().as_slice().to_vec();
+            p = p
+                .call_new(callee, "update", call_args().on_reply(noop.clone()).on_reject(noop.clone()))
+                .call_data_append(&callee_runs)
+                .call_perform();
+        }
+    }
+    p.reply().build()
+}
+
 /// Create + install a universal canister; optionally pre-grow its stable memory.
 async fn deploy_one(
     agent: &Agent,
@@ -233,6 +259,10 @@ async fn main() {
         .ok()
         .and_then(|s| s.parse().ok())
         .unwrap_or(4);
+    // fanout mode: each ingress fires N parallel calls (fire-and-forget) →
+    // N outstanding calls per in-flight ingress, to stress the guaranteed-
+    // response memory reservation and callback limits.
+    let fanout_mode = std::env::var("HAMMER_MODE").map(|m| m == "fanout").unwrap_or(false);
     // heapread mode: large heap-memory reads pulling lots of distinct state into
     // RAM. Each canister holds a 96 MiB heap global (built via append, small
     // transient); reads use queries (heap reads via update would OOM because
@@ -304,6 +334,40 @@ async fn main() {
         return;
     }
 
+    if fanout_mode {
+        // ---- Inter-canister FAN-OUT thrash (stresses response-memory reservation) ----
+        let mult: usize = std::env::var("HAMMER_FANOUT_MULT")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(1);
+        let n = canisters.len() * mult;
+        let cans = canisters.clone();
+        let mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let cans = cans.clone();
+            Arc::new(move || fanout_payload(&cans, mult))
+        };
+        println!(
+            "\n[fanout] inter-canister FAN-OUT storm ({secs}s): each ingress fires {n} parallel calls in ONE message (fire-and-forget), {concurrency} concurrent ingresses"
+        );
+        println!(
+            "  ({n} simultaneous reservations/ingress; the 64 MiB guaranteed-response cap allows only ~32 — expect rejections when {n} > ~32)"
+        );
+        let t = Instant::now();
+        let stats = storm(
+            agent.clone(),
+            canisters.clone(),
+            concurrency,
+            Duration::from_secs(secs),
+            mk,
+            false,
+        )
+        .await;
+        stats.report(&format!("INTER-CANISTER FAN-OUT (x{n} parallel/ingress)"), t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
     if read_mode {
         // Populate each canister with ~120 MiB of real stable data (written in
         // <=24 MiB chunks to respect the 32 MiB per-message dirty limit).

From dac58e4a16e5f0a423e2914d6ea1afd12a195f2e Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 15:26:21 +0200
Subject: [PATCH 13/14] test(canister_client): add hybrid load mode
 (read+write+messaging)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HAMMER_MODE=hybrid runs three storms concurrently over the canister pool:
query reads (24 MiB stable_read), update writes (8 MiB stable_fill), and
3-hop inter-canister call chains — splitting the concurrency budget. Shows
read/update path isolation and update-path contention under mixed load.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/canister_client/examples/hammer.rs | 67 +++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/rs/canister_client/examples/hammer.rs b/rs/canister_client/examples/hammer.rs
index 1cc9b38ba99c..3dfda207a5b5 100644
--- a/rs/canister_client/examples/hammer.rs
+++ b/rs/canister_client/examples/hammer.rs
@@ -263,6 +263,8 @@ async fn main() {
     // N outstanding calls per in-flight ingress, to stress the guaranteed-
     // response memory reservation and callback limits.
     let fanout_mode = std::env::var("HAMMER_MODE").map(|m| m == "fanout").unwrap_or(false);
+    // hybrid mode: reads + writes + inter-canister messaging all at once.
+    let hybrid_mode = std::env::var("HAMMER_MODE").map(|m| m == "hybrid").unwrap_or(false);
     // heapread mode: large heap-memory reads pulling lots of distinct state into
     // RAM. Each canister holds a 96 MiB heap global (built via append, small
     // transient); reads use queries (heap reads via update would OOM because
@@ -299,6 +301,71 @@ async fn main() {
     println!("  deployed {} canisters in {:.1}s", canisters.len(), t0.elapsed().as_secs_f64());
     let canisters = Arc::new(canisters);
 
+    if hybrid_mode {
+        // ---- Hybrid: heavy reads + writes + inter-canister messaging at once ----
+        const BIG_MIB: u32 = 96;
+        let chunk: u32 = 24 * MIB;
+        let grow_pages = chunk / 65536;
+        let windows = ((BIG_MIB * MIB) / chunk) as u64;
+        println!("\n[hybrid] populating {} canisters to ~{BIG_MIB} MiB stable each...", canisters.len());
+        for (i, c) in canisters.iter().enumerate() {
+            let mut off = 0u32;
+            while off + chunk <= BIG_MIB * MIB {
+                let _ = update(&agent, c, wasm().stable_grow(grow_pages).stable_fill(off, 0x40 + i as u32, chunk).reply().build()).await;
+                off += chunk;
+            }
+        }
+        println!("[hybrid] waiting ~20s for a checkpoint...");
+        tokio::time::sleep(Duration::from_secs(20)).await;
+
+        // Three concurrent storms over the full canister pool, splitting the
+        // concurrency budget. Each canister sees a mix of query-reads,
+        // update-writes, and inter-canister call chains simultaneously.
+        let each = (concurrency / 3).max(1);
+        let roff = Arc::new(AtomicU64::new(0));
+        let woff = Arc::new(AtomicU64::new(0));
+        let mctr = Arc::new(AtomicU64::new(0));
+        let read_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let roff = roff.clone();
+            Arc::new(move || {
+                let n = roff.fetch_add(1, Ordering::Relaxed);
+                wasm().stable_read(((n % windows) as u32) * chunk, chunk).reply().build()
+            })
+        };
+        let write_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let woff = woff.clone();
+            Arc::new(move || {
+                let n = woff.fetch_add(1, Ordering::Relaxed);
+                // overwrite 8 MiB within an existing window (dirties, no growth)
+                wasm().stable_fill(((n % windows) as u32) * chunk, 0x77, 8 * MIB).reply().build()
+            })
+        };
+        let msg_cans = canisters.clone();
+        let msg_mk: Arc<dyn Fn() -> Vec<u8> + Send + Sync> = {
+            let msg_cans = msg_cans.clone();
+            Arc::new(move || {
+                let n = mctr.fetch_add(1, Ordering::Relaxed) as usize;
+                chain_payload(&msg_cans, n % msg_cans.len(), 3)
+            })
+        };
+        println!(
+            "\n[hybrid] storm ({secs}s): reads(query 24 MiB) + writes(update 8 MiB) + messages(3-hop chains), {each} concurrent each over {} canisters",
+            canisters.len()
+        );
+        let t = Instant::now();
+        let (rs, ws, ms) = tokio::join!(
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), read_mk, true),
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), write_mk, false),
+            storm(agent.clone(), canisters.clone(), each, Duration::from_secs(secs), msg_mk, false),
+        );
+        rs.report("HYBRID reads (query, 24 MiB stable_read)", t.elapsed());
+        ws.report("HYBRID writes (update, 8 MiB stable_fill)", t.elapsed());
+        ms.report("HYBRID messages (3-hop call chains)", t.elapsed());
+
+        println!("\n== done ==");
+        return;
+    }
+
     if calls_mode {
         // ---- Inter-canister call thrash ----
         // Each ingress makes the target canister start a `call_depth`-hop chain

From c269633609e6b6e59f12563ed9019c84d0e7d323 Mon Sep 17 00:00:00 2001
From: Bjoern Tackmann <bjoern@dfinity.org>
Date: Tue, 16 Jun 2026 15:35:33 +0200
Subject: [PATCH 14/14] fix(config): cap MAX_HEAP_DELTA_PER_ITERATION below the
 heap-delta cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MAX_HEAP_DELTA_PER_ITERATION was 200 MB > SUBNET_HEAP_DELTA_CAPACITY
(96 MiB), so a single execution round could push the in-memory heap delta
far past the cap before the next round's skip-check — a transient spike of
unreclaimable (anonymous) resident memory (~200-300 MB) that threatens a
512 MiB VM under write load. Lower it to 64 MB so one round cannot
overshoot the cap, tightening the anonymous-memory ceiling.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 rs/config/src/subnet_config.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rs/config/src/subnet_config.rs b/rs/config/src/subnet_config.rs
index 48bd06192c44..69189a446556 100644
--- a/rs/config/src/subnet_config.rs
+++ b/rs/config/src/subnet_config.rs
@@ -103,7 +103,11 @@ const SYSTEM_SUBNET_FACTOR: u64 = 10;
 // so a round may take 1 to 4 seconds. To avoid regressing the throughput of
 // slow subnets while maintaining the speed of fast subnets, we use the middle
 // value of 200MB.
-const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(200 * M);
+// Nano-replica profile: use 64MB instead of the 200MB derived above, so a single
+// round's heap-delta production stays below SUBNET_HEAP_DELTA_CAPACITY (96 MiB)
+// and cannot overshoot the cap and spike unreclaimable (anonymous) resident memory.
+// This bounds the per-round dirty working set so writes stay safe on a 512 MiB - 1 GiB VM.
+const MAX_HEAP_DELTA_PER_ITERATION: NumBytes = NumBytes::new(64 * M);
 
 /// The reserve represents the freely available portion of the
 /// `subnet_heap_delta_capacity` that can be used as a heap delta burst