diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index cf6ddea88..e3e1cfc2d 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -249,42 +249,68 @@ int AicpuSoInfo::finalize() {
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        // Idempotent for the production case (sizes do not change across a
-        // worker's lifetime). If a caller asks for a larger layout, redo it.
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        return -1;
-    }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region — already-committed peers stay alive
+    // so their callers don't have to re-acquire.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            // hbg's runtime_arena path: caller passed 0 and never reserved
+            // a region. Leave the arena uncommitted; acquire_pooled_* will
+            // return nullptr.
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failed and left the arena uncommitted: drop the
+            // pending reserve so the arena is empty again. release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    // Failure of a later region leaves earlier peers committed on purpose:
+    // pooled pointers previously returned to callers must stay valid even if
+    // this resize attempt aborts.
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — in that case return nullptr so the caller can detect it.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1222,14 +1248,16 @@ int DeviceRunner::finalize() {
     // perf_cleanup guard; this is the backstop for the no-run-since-init case.
     finalize_collectors();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 53fb6555f..93501a916 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -185,25 +185,36 @@ struct KernelArgsHelper {
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. A committed
+     * region is reused when the request fits; a larger request re-commits only
+     * that region. `runtime_arena_size` == 0 (hbg) leaves that arena uncommitted.
+     * Returns 0 on success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Both pointers are stable for the lifetime of the Worker and
-     * the single underlying device buffer is released in `finalize()`.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have already committed the relevant region;
+     * otherwise these return nullptr. All pointers are stable for the
+     * Worker's lifetime; the three underlying device buffers are released
+     * in `finalize()`.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -602,22 +613,31 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // the same buffer when a later worker init asks for an equal-or-smaller
+    // layout on an already-committed arena.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Device resources
     rtStream_t stream_aicpu_{nullptr};
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 744b7291c..29c14d862 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -370,6 +378,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         // Restore kernel addrs + orch symbol names + active_callable_id; the
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1635f3a7a..9a9cbbabf 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -122,40 +122,68 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
         cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
+        return -1;
     }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
+        cached_gm_heap_size_ = 0;
+        cached_gm_sm_size_ = 0;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — in that case return nullptr so the caller can detect it.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1032,24 +1060,29 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
-    // Free all remaining allocations
-    mem_alloc_.finalize();
-    clear_cpu_sim_shared_storage();
-
-    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    // Free the 8-byte device_wall buffer (allocated lazily in run()) before
+    // mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_,
+    // so doing it after finalize would be a use-after-finalize.
     if (device_wall_dev_ptr_ != nullptr) {
         free_tensor(device_wall_dev_ptr_);
         device_wall_dev_ptr_ = nullptr;
     }
+
+    // Free all remaining allocations
+    mem_alloc_.finalize();
+    clear_cpu_sim_shared_storage();
+
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 73b3dfea2..46ee45913 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -75,24 +75,33 @@
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*.
+     * `runtime_arena_size` is 0 for the hbg path (leaves that arena
+     * uncommitted). Idempotent on identical sizes. Returns 0 on success,
+     * -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have already committed the relevant region.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -280,22 +289,29 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 4ad438a9c..fca663610 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -333,6 +341,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id);
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 41845bdf0..ccdc05ce0 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -140,9 +140,16 @@ struct HostApi {
     // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of
     // which runtime variant it is built against. Unset for this variant; do
     // not call.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    //
+    // setup_static_arena takes three sizes (GM heap, PTO2 shared memory,
+    // prebuilt runtime arena) so the platform layer's wrapper can be assigned
+    // to this field unchanged. host_build_graph has no prebuilt runtime
+    // arena: hbg-side callers pass runtime_arena_size == 0, which leaves that
+    // region uncommitted and makes acquire_pooled_runtime_arena return nullptr.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index f8e35917b..5c31c5b9a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -467,29 +467,60 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
             );
 
-            void *sm_ptr = runtime->get_gm_sm_ptr();
-            void *gm_heap = runtime->get_gm_heap_ptr();
+            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
+            // runtime arena image at host build time, so we no longer fetch
+            // them here. They remain on the host Runtime instance and on the
+            // PTO2Runtime header for diagnostic purposes only.
+            (void)dep_pool_capacity;
 
+            void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-            rt = runtime_create_from_sm(
-                PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_,
-                dep_pool_capacity
-            );
-            if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx);
-                // Unblock scheduler threads before returning so they don't spin forever.
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
+                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
 #if PTO2_PROFILING
             rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-            // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-            rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-            rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
-
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 027805918..71a482632 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -487,11 +487,15 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
     auto annot_layout =
         PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
-    if (replay_arena.commit() == nullptr || !tm_oracle.init_from_layout(oracle_layout, replay_arena) ||
-        !tm_annot.init_from_layout(annot_layout, replay_arena)) {
+    if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) ||
+        !tm_annot.init_data_from_layout(annot_layout, replay_arena)) {
         LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
         return -3;
     }
+    // Replay tensormaps live entirely on host; only arena-internal pointer
+    // fields need wiring (no parent-orch back-reference exists anymore).
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena);
 
     // JSON output accumulators.
     std::vector<TaskTableEntry> task_table;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index a75205196..e40aa5ae7 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -36,11 +36,13 @@
 #include <cstring>
 
 #include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
 #include "../runtime/pto_shared_memory.h"
 #include "../runtime/runtime.h"
 #include "callable.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
+#include "device_arena.h"
 #include "prepare_callable_common.h"
 
 // Helper: return current time in milliseconds
@@ -271,15 +273,35 @@ extern "C" int bind_prepared_to_runtime_impl(
     uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
     uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
 
-    // Lay out the per-Worker static device arena. GM heap (orchestrator output
-    // buffers, all rings combined) and PTO2 shared memory live in a single
-    // backing allocation; setup_static_arena reserves both regions and
-    // commits in one shot. Owned by DeviceRunner across runs — do NOT record
-    // in tensor_pairs_; the free is deferred to DeviceRunner::finalize().
+    // Set up the per-Worker static device buffers. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena are three independent device allocations
+    // (no longer one combined buffer); setup_static_arena creates all three.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
+    // dep_pool_size comes from a uint64 env var; reject values that don't fit
+    // the int32_t layout-sizing path rather than silently truncating.
+    int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+    if (runtime->dep_pool_size != 0) {
+        if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size);
+            return -1;
+        }
+        eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+    }
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
     int64_t t_setup_start = _now_ms();
-    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) {
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
         LOG_ERROR("Failed to setup pooled static arena");
         return -1;
     }
@@ -303,9 +325,48 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     runtime->set_gm_sm_ptr(sm_ptr);
 
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
     // Set up device orchestration state
     runtime->set_orch_args(device_args);
 
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
     LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
     int64_t t_total_end = _now_ms();
@@ -313,6 +374,7 @@ extern "C" int bind_prepared_to_runtime_impl(
     LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
     LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
     return 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 5f6d20855..f80c7a655 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -355,11 +355,21 @@ static bool prepare_task(
 
     prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
 
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init() skip the O(window_size) bind loop.
+    // Both writes hit the same 64B slot_state cache line we're about to
+    // dirty below, so the extra cost is two stores on an already-hot line.
+    // Must precede the scheduler wiring.queue.push at the end of
+    // submit_task_common — that push is the first read of slot_state->task /
+    // slot_state->payload by another thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
     // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
     //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
     //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
     // Fields immutable after RingSchedState::init():
-    //   payload, task, ring_id
+    //   ring_id
     // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
     // observers); set to PENDING here when orchestrator actually reuses the slot.
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
@@ -374,89 +384,6 @@ static bool prepare_task(
     return true;
 }
 
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap,
-    uint64_t heap_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = sm_header_arg;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto &ring = sm_header_arg->rings[r];
-
-        orch->rings[r].task_allocator.init(
-            ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive,
-            ring_heap_base, heap_size, &sm_header_arg->orch_error_code
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        // aligned_zalloc-equivalent: pool relies on zeroed entries.
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code);
-    }
-
-    if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-    orch->tensor_map.orch = orch;
-
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
 // =============================================================================
 // Scope Management
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 37fd0dcac..7dd47b19a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -142,14 +142,21 @@ struct PTO2OrchestratorState {
         int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
     );
 
-    // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool
-    // and tensor_map. Arena must be committed; layout must come from
-    // reserve_layout() against the same arena.
-    bool init_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap,
-        uint64_t heap_size
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
     );
 
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
     // Forget pointers; arena owns the backing buffers.
     void destroy();
     void set_scheduler(PTO2SchedulerState *scheduler);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..abd2a7510 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -68,10 +68,22 @@ class PTO2TaskAllocator {
 public:
     /**
      * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
      */
     void init(
         PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
     ) {
         descriptors_ = descriptors;
         window_size_ = window_size;
@@ -81,7 +93,7 @@ class PTO2TaskAllocator {
         heap_base_ = heap_base;
         heap_size_ = heap_size;
         error_code_ptr_ = error_code_ptr;
-        local_task_id_ = current_index_ptr->load(std::memory_order_relaxed);
+        local_task_id_ = initial_local_task_id;
         heap_top_ = 0;
         heap_tail_ = 0;
         last_alive_seen_ = 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..f39bac365 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = {
 };
 
 // =============================================================================
-// Runtime Creation and Destruction
+// Runtime Lifecycle (AICPU-only fixup)
 // =============================================================================
-
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity
-) {
-    if (!sm_base || sm_size == 0) return nullptr;
-
-    // Phase 1: layout. Reserve every sub-region the runtime needs (including
-    // the SM handle wrapper itself) without touching memory yet.
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-    const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    PTO2OrchestratorLayout orch_layout =
-        PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    // Phase 2: single backing allocation.
-    if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr;
-
-    // Phase 3: bind region pointers and initialize.
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(off_runtime));
-    memset(rt, 0, sizeof(*rt));  // calloc-equivalent for the runtime header.
-
-    // Initialize the SM handle wrapper in-place on its arena region before
-    // anything that reads sm_handle->header (orchestrator / scheduler init).
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_sm_handle));
-    memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-    if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
     rt->ops = &s_runtime_ops;
-    rt->mode = mode;
-    rt->gm_heap = gm_heap;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-
-    if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-    if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) {
-        rt->orchestrator.destroy();
-        arena.release();
-        return nullptr;
-    }
-    rt->orchestrator.set_scheduler(&rt->scheduler);
-
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(off_mailbox));
-    memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
-
-    return rt;
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) {
-    if (!rt) {
-        arena.release();  // safe: idempotent if nothing's committed.
-        return;
-    }
-
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;  // arena-owned.
-    rt->sm_handle = nullptr;       // wrapper lives in arena; release() reclaims it.
-
-    // arena.release() frees the single backing buffer that holds rt,
-    // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot.
-    arena.release();
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
 }
 
 void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 5709a85b7..460624e69 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -91,6 +91,30 @@ struct PTO2RuntimeOps {
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 };
 
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_size{0};
+    uint64_t heap_size{0};
+    int32_t dep_pool_capacity{0};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
 /**
  * PTO Runtime2 context
  *
@@ -118,6 +142,16 @@ struct PTO2Runtime {
 
     // Statistics
     int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Stashed on host by runtime_maker.cpp
+    // after runtime_init_data_from_layout + runtime_wire_arena_pointers; read
+    // by aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
 // =============================================================================
@@ -125,38 +159,60 @@ struct PTO2Runtime {
 // =============================================================================
 
 /**
- * Create runtime from caller-provided GM SM buffer + GM heap.
- *
- * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime,
- * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map
- * sub-regions) is laid out on the supplied arena and committed in a single
- * backing allocation — including the SM handle wrapper itself. The arena is
- * owned by the caller (typically the per-Worker AicpuExecutor);
- * runtime_destroy() calls arena.release() once to free the lot.
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
  *
- * `sm_base` / `sm_size` describe the SM buffer that host has already placed
- * for the runtime to use; the SM handle wrapper is constructed in-place on
- * an arena-reserved region pointing at that buffer.
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
  *
- * @param mode             Execution mode
- * @param sm_base          Pre-allocated SM buffer base (host-owned)
- * @param sm_size          Size of the SM buffer in bytes
- * @param task_window_size Per-ring task window size used to lay out SM
- * @param gm_heap          GM heap base for output buffers (or NULL if not used)
- * @param heap_size        GM heap size in bytes
- * @param arena            Caller-owned arena that sources all runtime sub-regions.
- *                         Must be freshly constructed (no prior commit) —
- *                         runtime_create_from_sm reserves + commits internally.
- * @return Runtime context, or NULL on failure
- */
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
 );
 
 /**
- * Destroy runtime and free all resources. arena.release() is the actual
- * memory free; the rt pointer is no longer valid afterward.
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so arena.release() is never called
+ * here — this function only forgets sub-structure pointers (idempotent
+ * cleanup).
  */
 void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index fcd8a27bd..f217e7ac3 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState {
     // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
     std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
 
-    // --- Immutable after RingSchedState::init() (same value on every slot reuse) ---
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
@@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState {
     int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
 
     /**
-     * One-time binding of slot-invariant fields.
-     * Called during RingSchedState::init() — these values are determined by
-     * the slot's position in the ring and never change across reuses.
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
      */
-    void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) {
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
         payload = p;
         task = t;
-        ring_id = rid;
     }
 
     /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index 5e1b6faa8..98b832510 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -58,6 +58,13 @@ struct alignas(64) PTO2RingFlowControl {
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
     void init() {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
@@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle {
     void setup_pointers(uint64_t task_window_size);
     void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 };
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
+// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Accepts a
+// per-ring window-size array so the helper's signature mirrors
+// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
+// disagree with the SM layout when (hypothetically) ring sizes diverge.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++) {
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index cf1f2d28d..b63f20676 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -47,8 +47,6 @@
 #include "pto_runtime2_types.h"
 #include "tensor.h"
 
-struct PTO2OrchestratorState;  // forward declare
-
 /**
  * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
  * region offsets returned by DeviceArena::reserve() so init_from_layout()
@@ -369,8 +367,6 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    PTO2OrchestratorState *orch{nullptr};
-
     uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
@@ -435,11 +431,19 @@ struct PTO2TensorMap {
     reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 
     /**
-     * Phase 3: bind region pointers and initialize state. The arena must already
-     * be committed; layout must have been produced by reserve_layout() against
-     * the same arena.
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
      */
-    bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
 
     /**
      * Tear down state. Does not free memory — the arena owns the backing
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 73b6027c4..8e1bb1567 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -119,19 +119,25 @@ struct HostApi {
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Lay out and commit the per-Worker static device arena that backs both
-    // the PTO2 GM heap and the PTO2 shared memory in a single underlying
-    // allocation. Must be called once before acquire_pooled_gm_heap /
-    // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on
-    // success, -1 on allocation failure.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory. The static arena must already be committed via
-    // setup_static_arena; the returned pointer is owned by the DeviceRunner
-    // and freed in `DeviceRunner::finalize()` — do NOT pass it to
-    // device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
@@ -211,6 +217,13 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to skip
+    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
+    // (already populated by runtime_init_data_from_layout + wire on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
@@ -247,6 +260,16 @@ class Runtime {
     void set_slot_states_ptr(void *p);
     void set_orch_args(const ChipStorageTaskArgs &args);
 
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
     void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 281a714fb..2d777e9b0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -61,152 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
 }
 #endif
 
-// =============================================================================
-// Ready Queue Implementation
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        queue->slots[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler Initialization
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) {
-    ring = &sm_header->rings[ring_id];
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-
-    // Initialize all per-task slot state fields.
-    // bind() sets payload, task, ring_id — immutable after init, bound once
-    // to their fixed shared-memory addresses.
-    // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
-    // rest zero) so the first submit needs no reset.
-    for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast<uint8_t>(ring_id));
-        ring->slot_states[i].reset_for_reuse();
-        ring->slot_states[i].fanin_count = 0;
-        ring->slot_states[i].active_mask = ActiveMask{};
-    }
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = sm_header_arg;
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    // Per-ring scheduler state — no arena buffers, just field init.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init(sm_header_arg, r)) {
-            return false;
-        }
-    }
-
-    // Ready queues — one per resource shape plus DUMMY.
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-
-    // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated
-    // base + capacity, so we just plumb the arena region into it.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        // calloc-equivalent: pool expects entries zeroed at construction.
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(
-            dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code
-        );
-    }
-
-    // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop).
-    if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-
-    sched->wiring.queue.destroy();
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-}
-
 // =============================================================================
 // Debug Utilities
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 8d50681ba..510187feb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue {
 //                     initialize sequence counters
 //   destroy: forget the slots pointer (arena owns the buffer)
 size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Writes capacity, mask, the enqueue/dequeue positions and every slot's
+// sequence/slot_state — everything *except* the arena-internal `slots`
+// pointer field. arena.region_ptr(slots_off) is used only to address the
+// slot array for those writes and is NOT stored in `queue->slots`; call
+// `ready_queue_wire_arena_pointers` afterwards to set that field.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
 void ready_queue_destroy(PTO2ReadyQueue *queue);
 
 // =============================================================================
@@ -449,15 +456,17 @@ struct alignas(64) PTO2SpscQueue {
         return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
     }
 
-    // Bind buffer pointer + reset indices. The capacity must be a power of two
-    // and match the value passed to reserve_layout.
-    bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+    // Zeroes the slot pointer array and sets mask/head/tail — everything except
+    // the arena-internal `buffer_` pointer field — and returns false unless
+    // capacity is a non-zero power of two. The host pre-builds the image without
+    // a host address in buffer_; the AICPU wires it at boot via wire_arena_pointers().
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
         if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
         // calloc'd-equivalent: zero the slot pointers so spurious early pops
         // observe nullptr.
         for (uint64_t i = 0; i < capacity; i++)
-            buffer_[i] = nullptr;
+            buf[i] = nullptr;
         mask_ = capacity - 1;
         head_.store(0, std::memory_order_relaxed);
         tail_.store(0, std::memory_order_relaxed);
@@ -466,6 +475,12 @@ struct alignas(64) PTO2SpscQueue {
         return true;
     }
 
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
     // Arena owns the buffer; here we only forget our pointer.
     void destroy() { buffer_ = nullptr; }
 
@@ -563,7 +578,12 @@ struct PTO2SchedulerState {
         // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
         alignas(64) PTO2DepListPool dep_pool;
 
-        bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id);
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
         void destroy();
 
         void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
@@ -1042,13 +1062,23 @@ struct PTO2SchedulerState {
 
     // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
     // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
-    // Capacities are baked into the returned layout; init_from_layout uses
+    // Capacities are baked into the returned layout; init_data_from_layout uses
     // the same values.
     static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
 
-    // Phase 3: bind region pointers and initialize state. The arena must be
-    // committed; layout must come from reserve_layout() against the same arena.
-    bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header);
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
 
     // Forget per-region pointers; arena owns the backing memory.
     void destroy();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..d66acfcc4
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // slots[] starts on its own cache line so CAS traffic on the first slot
+    // never shares a line with the region reserved just before it.
+    const auto slots_bytes = capacity * sizeof(PTO2ReadyQueueSlot);
+    return arena.reserve(slots_bytes, PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // Returns false unless capacity is a non-zero power of two: `mask` below is
+    // only a valid index mask in that case (same guard as PTO2SpscQueue).
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+    // queue->slots is left untouched; ready_queue_wire_arena_pointers sets it.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // No SM slot is touched here: binding and resetting the per-task slot
+    // states is done by PTO2SharedMemoryHandle::init_header_per_ring, so this
+    // can run against a host-built image without dereferencing SM.
+    advance_lock.store(0, std::memory_order_relaxed);
+    last_task_alive = 0;
+
+    // `ring` holds the device address of this ring's SM header, obtained by
+    // offset arithmetic from sm_dev_base rather than by reading SM.
+    auto *ring_header_dev = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+    ring = ring_header_dev;
+
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    PTO2SchedulerLayout layout{};
+    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    layout.dep_pool_capacity = dep_pool_capacity;
+    const size_t dep_pool_bytes = static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    // Reserve, in order: one slot array per resource shape, the dummy queue's slot
+    // array, one dep-pool array per ring, then the wiring SPSC buffer.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        layout.off_ready_queue_slots[shape] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        // Cache-line base: thread 0 is the sole dep_pool writer, and its writes must not
+        // invalidate neighbouring multi-threaded regions such as ready_queue.slots.
+        layout.off_dep_pool_entries[ring_idx] = arena.reserve(dep_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return layout;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    PTO2SchedulerState *sched = this;
+    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    sched->tasks_completed.store(0, std::memory_order_relaxed);
+    sched->tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+    // Per-ring scheduler state: plain field init, no arena region involved.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
+            return false;
+        }
+    }
+    // Ready queues: one per resource shape, then the dummy queue; any failure aborts init.
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        if (!ready_queue_init_data_from_layout(
+                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
+            )) {
+            return false;
+        }
+    }
+    if (!ready_queue_init_data_from_layout(
+            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+        )) {
+        return false;
+    }
+    // Per-ring dep pools: entries are zeroed here; dep_pool.base is re-set later by wire_arena_pointers.
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
+        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err);
+    }
+    // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop) and its batch/backoff counters.
+    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    sched->wiring.batch_count = 0;
+    sched->wiring.batch_index = 0;
+    sched->wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    // Stores only the pointer fields that refer into `arena`; queue and pool contents are not modified.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_wire_arena_pointers(&ready_queues[shape], arena, layout.off_ready_queue_slots[shape]);
+    }
+    ready_queue_wire_arena_pointers(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto *dep_region = arena.region_ptr(layout.off_dep_pool_entries[ring_idx]);
+        ring_sched_states[ring_idx].dep_pool.base = static_cast<PTO2DepListEntry *>(dep_region);
+    }
+    wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    // Drops every pointer into arena/SM storage; nothing is freed here.
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        ring_sched_states[ring_idx].destroy();
+        ring_sched_states[ring_idx].dep_pool.base = nullptr;
+    }
+    wiring.queue.destroy();
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_destroy(&ready_queues[shape]);
+    }
+    ready_queue_destroy(&dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    PTO2OrchestratorLayout layout{};
+    layout.dep_pool_capacity = dep_pool_capacity;
+    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    // Every ring's fanin spill pool has the same PTO2_ALIGN_SIZE-padded size, so compute it once.
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        layout.off_fanin_pool[ring_idx] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    const size_t scope_tasks_bytes = static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *);
+    layout.off_scope_tasks = arena.reserve(scope_tasks_bytes, alignof(PTO2TaskSlotState *));
+    const size_t scope_begins_bytes = static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t);
+    layout.off_scope_begins = arena.reserve(scope_begins_bytes, alignof(int32_t));
+    // The tensor map reserves its own regions last.
+    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return layout;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    auto *orch = this;
+    *orch = PTO2OrchestratorState{};
+    // gm_heap is treated as PTO2_MAX_RING_DEPTH consecutive per-ring slices of heap_size bytes each.
+    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+    orch->gm_heap_base = gm_heap;
+    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+    orch->fatal = false;
+
+    // Mirror the SM API's per-ring window-size shape so a future per-ring
+    // SM layout cannot silently disagree with the addresses we compute here.
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        task_window_sizes[r] = task_window_size;
+    // All SM addresses below come from pto2_sm_layout offset helpers applied to sm_dev_base.
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+        // NOTE(review): task_window_size is narrowed from uint64_t to int32_t below — confirm callers bound it.
+        orch->rings[r].task_allocator.init(
+            task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base,
+            heap_size, orch_err
+        );
+        // Zero the whole padded fanin region; the pool's base pointer is set again by wire_arena_pointers.
+        const size_t fanin_pool_bytes =
+            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err);
+    }
+
+    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+    // Scope counters are reset here; the scope_tasks / scope_begins pointers are set by wire_arena_pointers.
+    orch->scope_tasks_size = 0;
+    orch->scope_tasks_capacity = layout.scope_tasks_cap;
+    orch->scope_stack_top = -1;
+    orch->scope_stack_capacity = layout.scope_stack_capacity;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    // Stores the arena-derived pointer fields and the scheduler back-pointer; tensor_map wires its own.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+    }
+    tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {
+    // Forgets the arena-backed pointers; `scheduler` is left as is and nothing is freed.
+    tensor_map.destroy();
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        rings[ring_idx].fanin_pool.base = nullptr;
+    }
+    scope_tasks = nullptr;
+    scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    // The orchestrator layout API takes one window size per ring; every ring gets the same value.
+    const auto ring_window = static_cast<int32_t>(task_window_size);
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int32_t &entry : task_window_sizes) {
+        entry = ring_window;
+    }
+    PTO2RuntimeArenaLayout layout{};
+    layout.task_window_size = task_window_size;
+    layout.dep_pool_capacity = dep_pool_capacity;
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+    // Taken after the last reserve() so it spans every region above (assumes total_size() tracks reservations).
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+    // NOTE(review): assumes all-zero bytes are a valid start state for PTO2Runtime (it holds std::atomic members).
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+    // Orchestrator first, then scheduler; either failure returns nullptr with the arena left partly written.
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
+        )) {
+        return nullptr;
+    }
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+    // Mailbox region starts zeroed; rt->aicore_mailbox is set later by runtime_wire_arena_pointers.
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Pointer teardown only: DeviceRunner pools the arena buffer across runs, so it is never freed here.
+    if (rt == nullptr) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->sm_handle = nullptr;
+    rt->aicore_mailbox = nullptr;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
index 358c87f57..1e1edff92 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
@@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring(
     header->sched_error_bitmap.store(0, std::memory_order_relaxed);
     header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
     header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
 }
 
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index a0b98bd09..b99c67233 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -81,43 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task
     return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
 }
 
-bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
     num_buckets = layout.num_buckets;
     pool_size = layout.pool_size;
 
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
 
     // buckets[]: empty == nullptr.
     for (int32_t i = 0; i < num_buckets; i++) {
-        buckets[i] = nullptr;
+        buckets_arena[i] = nullptr;
     }
 
     // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
     // The pool's persistent invariant after init is "bucket_index == -1 means
     // not linked", set explicitly below.
-    memset(entry_pool, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
     for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool[i].bucket_index = -1;
-        entry_pool[i].next_in_bucket = nullptr;
-        entry_pool[i].prev_in_bucket = nullptr;
-        entry_pool[i].next_in_task = nullptr;
-        entry_pool[i].prev_in_task = nullptr;
-        entry_pool[i].producer_task_id = PTO2TaskId{};
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
     }
 
     // free_entry_list: zeroed (was calloc'd before); contents become meaningful
     // only after entries are freed back, so the body of the array stays as 0.
-    memset(free_entry_list, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
 
     next_entry_idx = 0;
     free_num = 0;
 
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
         for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            task_entry_heads[r][i] = nullptr;
+            heads_arena[i] = nullptr;
         }
         task_window_sizes[r] = layout.task_window_sizes[r];
         last_task_alives[r] = 0;
@@ -127,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr
     return true;
 }
 
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+}
+
 void PTO2TensorMap::destroy() {
     // Arena owns the backing memory; here we only forget our pointers so any
     // stray post-destroy access trips a nullptr dereference instead of reading
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 6a7ab65da..b3347b53c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -44,6 +44,8 @@ Runtime::Runtime() {
     gm_heap_ptr_ = nullptr;
     slot_states_ptr_ = nullptr;
     orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
 
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
@@ -74,6 +76,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
 void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;
+    prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
 // Device orchestration SO metadata (bytes live in a separate device buffer
 // owned by DeviceRunner; only the address/size travels in Runtime).
 void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 38242555d..506613dcd 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -195,40 +195,75 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); }
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). A larger request redoes only that region. NOTE(review):
+    // when a later region fails, the earlier peers are released below, so pooled
+    // pointers handed out before are dangling (sim keeps peers) — confirm intent.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            // Size 0 means no region is wanted (hbg's runtime_arena path).
+            // Drop any buffer committed by an earlier call and leave the arena
+            // uncommitted; acquire_pooled_* will then return nullptr.
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failure leaves committed_=false. Discard the pending
+            // reserve so the arena is clean for the next attempt; release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
         cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
+        return -1;
     }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
+        cached_gm_heap_size_ = 0;
+        cached_gm_sm_size_ = 0;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — return nullptr so such a caller can detect the misuse.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1039,14 +1074,25 @@ int DeviceRunner::finalize() {
         pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
+
+    // Free the 8-byte device_wall buffer (allocated lazily in run()) while
+    // mem_alloc_ and the device context are still live. free_tensor() routes
+    // through mem_alloc_.free(), so it must run before mem_alloc_.finalize()
+    // below and before rtDeviceReset() tears down the device runtime.
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
 
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();
@@ -1057,11 +1103,6 @@ int DeviceRunner::finalize() {
         return rc;
     }
 
-    // Free the 8-byte device_wall buffer (allocated lazily in run()).
-    if (device_wall_dev_ptr_ != nullptr) {
-        free_tensor(device_wall_dev_ptr_);
-        device_wall_dev_ptr_ = nullptr;
-    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index a07ab28bb..0d8cc0397 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -174,24 +174,36 @@ struct KernelArgsHelper {
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
+     * prebuilt runtime arena) — the corresponding arena stays uncommitted.
+     * Returns 0 on success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have already committed the relevant region;
+     * otherwise these return nullptr. A pointer stays valid until a later
+     * setup_static_arena call re-commits or releases its region, or until
+     * `finalize()` releases the three underlying device buffers.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -511,22 +523,30 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Device resources
     rtStream_t stream_aicpu_{nullptr};
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 0cc17c81f..1a2bb32a9 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -426,6 +434,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         // Restore kernel addrs + orch symbol names + active_callable_id; the
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index c0d26fbe1..b3072919c 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -112,40 +112,66 @@ static int prof_free_cb(void *dev_ptr) {
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        return -1;
-    }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region — already-committed peers stay alive
+    // so their callers don't have to re-acquire.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            // Size 0 means no region is wanted (hbg's runtime_arena path).
+            // Drop any buffer committed by an earlier call and leave the arena
+            // uncommitted; acquire_pooled_* will then return nullptr.
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failure leaves committed_=false. Discard the pending
+            // reserve so the arena is clean for the next attempt; release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    // Failure of a later region leaves earlier peers committed on purpose: their
+    // pooled pointers must stay valid even if this resize attempt aborts. Only
+    // the failing region's own buffer is gone (released before the retry).
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -929,14 +955,16 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations
     mem_alloc_.finalize();
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 0aa6e6fa1..59b685572 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -72,24 +72,33 @@
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path
+     * (leaves that arena uncommitted). Returns 0 on success, -1 on
+     * failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have already committed the relevant region;
+     * otherwise these return nullptr.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the region is only
+     * committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path returns nullptr.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -280,22 +289,30 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 81e9b138f..f2dc10b4e 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -328,6 +336,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id);
diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h
index b9edf7020..25c6c13f4 100644
--- a/src/a5/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a5/runtime/host_build_graph/runtime/runtime.h
@@ -146,9 +146,10 @@ struct HostApi {
     // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of
     // which runtime variant it is built against. Unset for this variant; do
     // not call.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index bcea9b09e..49d55380f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -125,8 +125,10 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
-    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
+    // Per-Worker arena attaching to the pooled prebuilt runtime image. Host
+    // populates the layout + data on its own arena, rtMemcpys into a pooled
+    // device buffer owned by DeviceRunner, and the AICPU attach()es to that
+    // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc.
     // Default-constructed: libc-backed backend, no ctx.
     DeviceArena runtime_arena_;
 
@@ -466,29 +468,61 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
             );
 
-            void *sm_ptr = runtime->get_gm_sm_ptr();
-            void *gm_heap = runtime->get_gm_heap_ptr();
+            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
+            // runtime arena image at host build time, so we no longer fetch
+            // them here. They remain on the host Runtime instance and on the
+            // PTO2Runtime header for diagnostic purposes only.
+            (void)dep_pool_capacity;
 
+            void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-            rt = runtime_create_from_sm(
-                PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_,
-                dep_pool_capacity
-            );
-            if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx);
-                // Unblock scheduler threads before returning so they don't spin forever.
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
+                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
+
 #if PTO2_PROFILING
             rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-            // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-            rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-            rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
-
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 0c7ac3872..037d3ab04 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -36,8 +36,10 @@
 #include <cstring>
 
 #include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
 #include "../runtime/pto_shared_memory.h"
 #include "../runtime/runtime.h"
+#include "device_arena.h"
 #include "callable.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
@@ -271,15 +273,35 @@ extern "C" int bind_prepared_to_runtime_impl(
     uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
     uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
 
-    // Lay out the per-Worker static device arena. GM heap (orchestrator output
-    // buffers, all rings combined) and PTO2 shared memory live in a single
-    // backing allocation; setup_static_arena reserves both regions and
-    // commits in one shot. Owned by DeviceRunner across runs — do NOT record
-    // in tensor_pairs_; the free is deferred to DeviceRunner::finalize().
+    // Lay out the per-Worker static device buffers: GM heap, PTO2 shared
+    // memory, and the prebuilt runtime arena. setup_static_arena sizes all
+    // three in one call; DeviceRunner pools each in its own arena. They are
+    // owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
+    // dep_pool_size comes from a uint64 env var; reject values that don't fit
+    // the int32_t layout-sizing path rather than silently truncating.
+    int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+    if (runtime->dep_pool_size != 0) {
+        if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size);
+            return -1;
+        }
+        eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+    }
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
     int64_t t_setup_start = _now_ms();
-    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) {
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
         LOG_ERROR("Failed to setup pooled static arena");
         return -1;
     }
@@ -303,9 +325,48 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     runtime->set_gm_sm_ptr(sm_ptr);
 
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
     // Set up device orchestration state
     runtime->set_orch_args(device_args);
 
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
     LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
     int64_t t_total_end = _now_ms();
@@ -313,6 +374,7 @@ extern "C" int bind_prepared_to_runtime_impl(
     LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
     LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
     return 0;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 05ac105a8..c937fd986 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -329,11 +329,22 @@ static bool prepare_task(
 
     prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
 
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init_data_from_layout() skip the
+    // O(window_size) bind loop. Both writes hit the same 64B slot_state
+    // cache line we're about to dirty below, so the extra cost is two
+    // stores on an already-hot line. Must precede the scheduler
+    // wiring.queue.push at the end of submit_task_common — that push is
+    // the first read of slot_state->task / slot_state->payload by another
+    // thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
     // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
     //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
     //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
-    // Fields immutable after RingSchedState::init():
-    //   payload, task, ring_id
+    // Fields immutable after RingSchedState::init_data_from_layout():
+    //   ring_id
     // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
     // observers); set to PENDING here when orchestrator actually reuses the slot.
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
@@ -348,88 +359,6 @@ static bool prepare_task(
     return true;
 }
 
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap,
-    uint64_t heap_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = sm_header_arg;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto &ring = sm_header_arg->rings[r];
-
-        orch->rings[r].task_allocator.init(
-            ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive,
-            ring_heap_base, heap_size, &sm_header_arg->orch_error_code
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code);
-    }
-
-    if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-    orch->tensor_map.orch = orch;
-
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
 // =============================================================================
 // Scope Management
 // =============================================================================
@@ -578,9 +507,6 @@ static TaskOutputTensors submit_task_common(
     auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
         PTO2TaskSlotState *prod_state =
             &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local());
-        if (prod_state->task == nullptr || prod_state->task->task_id != producer_task_id) {
-            return true;  // producer slot reused for a different task — dep is moot
-        }
         return append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id);
     };
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index e24b85b4e..9a73714c0 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -133,19 +133,29 @@ struct PTO2OrchestratorState {
     // === Cold-path API (defined in pto_orchestrator.cpp) ===
 
     // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
-    // tensor_map sub-layout) on the supplied arena.
+    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_data_from_layout.
     static PTO2OrchestratorLayout reserve_layout(
         DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
         int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
     );
 
-    // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool
-    // and tensor_map. Arena must be committed.
-    bool init_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap,
-        uint64_t heap_size
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
     );
 
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
     // Forget pointers; arena owns the backing buffers.
     void destroy();
     void set_scheduler(PTO2SchedulerState *scheduler);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..abd2a7510 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -68,10 +68,22 @@ class PTO2TaskAllocator {
 public:
     /**
      * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
      */
     void init(
         PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
     ) {
         descriptors_ = descriptors;
         window_size_ = window_size;
@@ -81,7 +93,7 @@ class PTO2TaskAllocator {
         heap_base_ = heap_base;
         heap_size_ = heap_size;
         error_code_ptr_ = error_code_ptr;
-        local_task_id_ = current_index_ptr->load(std::memory_order_relaxed);
+        local_task_id_ = initial_local_task_id;
         heap_top_ = 0;
         heap_tail_ = 0;
         last_alive_seen_ = 0;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..f39bac365 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = {
 };
 
 // =============================================================================
-// Runtime Creation and Destruction
+// Runtime Lifecycle (AICPU-only fixup)
 // =============================================================================
-
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity
-) {
-    if (!sm_base || sm_size == 0) return nullptr;
-
-    // Phase 1: layout. Reserve every sub-region the runtime needs (including
-    // the SM handle wrapper itself) without touching memory yet.
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-    const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    PTO2OrchestratorLayout orch_layout =
-        PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    // Phase 2: single backing allocation.
-    if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr;
-
-    // Phase 3: bind region pointers and initialize.
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(off_runtime));
-    memset(rt, 0, sizeof(*rt));  // calloc-equivalent for the runtime header.
-
-    // Initialize the SM handle wrapper in-place on its arena region before
-    // anything that reads sm_handle->header (orchestrator / scheduler init).
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_sm_handle));
-    memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-    if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
     rt->ops = &s_runtime_ops;
-    rt->mode = mode;
-    rt->gm_heap = gm_heap;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-
-    if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-    if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) {
-        rt->orchestrator.destroy();
-        arena.release();
-        return nullptr;
-    }
-    rt->orchestrator.set_scheduler(&rt->scheduler);
-
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(off_mailbox));
-    memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
-
-    return rt;
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) {
-    if (!rt) {
-        arena.release();  // safe: idempotent if nothing's committed.
-        return;
-    }
-
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;  // arena-owned.
-    rt->sm_handle = nullptr;       // wrapper lives in arena; release() reclaims it.
-
-    // arena.release() frees the single backing buffer that holds rt,
-    // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot.
-    arena.release();
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
 }
 
 void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 1da622407..460624e69 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -91,6 +91,30 @@ struct PTO2RuntimeOps {
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 };
 
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_size{0};
+    uint64_t heap_size{0};
+    int32_t dep_pool_capacity{0};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
 /**
  * PTO Runtime2 context
  *
@@ -118,6 +142,16 @@ struct PTO2Runtime {
 
     // Statistics
     int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Assigned on host by
+    // bind_prepared_to_runtime_impl once init_data + wire have run, just
+    // before the image upload; read by aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
 // =============================================================================
@@ -125,31 +159,60 @@ struct PTO2Runtime {
 // =============================================================================
 
 /**
- * Create runtime from caller-provided GM SM buffer + GM heap.
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
  *
- * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime,
- * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map
- * sub-regions) is laid out on the supplied arena and committed in a single
- * backing allocation. runtime_destroy() calls arena.release() once to free
- * the lot.
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
  *
- * @param mode             Execution mode
- * @param sm_base          Pre-allocated SM buffer base (host-owned)
- * @param sm_size          Size of the SM buffer in bytes
- * @param task_window_size Per-ring task window size used to lay out SM
- * @param gm_heap          GM heap base for output buffers (or NULL if not used)
- * @param heap_size        GM heap size in bytes
- * @param arena            Caller-owned arena that sources all runtime sub-regions.
- * @return Runtime context, or NULL on failure
- */
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
 );
 
 /**
- * Destroy runtime and free all resources. arena.release() is the actual
- * memory free; the rt pointer is no longer valid afterward.
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so we never call arena.release()
+ * here — this function only forgets sub-structure pointers (idempotent
+ * cleanup).
  */
 void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 999dbf6c5..a0dfbd9ef 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -92,7 +92,7 @@
 
 // Task management
 // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
-// Actual window size is passed at runtime to runtime_create_from_sm().
+// Actual window size is passed at runtime to runtime_reserve_layout().
 // Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
 
@@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState {
     // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
     std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
 
-    // --- Immutable after RingSchedState::init() (same value on every slot reuse) ---
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
@@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState {
     int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
 
     /**
-     * One-time binding of slot-invariant fields.
-     * Called during RingSchedState::init() — these values are determined by
-     * the slot's position in the ring and never change across reuses.
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
      */
-    void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) {
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
         payload = p;
         task = t;
-        ring_id = rid;
     }
 
     /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index cf8dbb780..98b832510 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -53,11 +53,18 @@ struct PTO2SharedMemoryHandle;
  */
 struct alignas(64) PTO2RingFlowControl {
     // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
-    std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
+    alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
 
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
     void init() {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
@@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle {
     void setup_pointers(uint64_t task_window_size);
     void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 };
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The AICPU derives the same addresses via PTO2SharedMemoryHandle's own
+// setup_pointers; keep both in sync so values stay consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(  // address of PTO2SharedMemoryHeader::orch_error_code
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(  // address of PTO2SharedMemoryHeader::rings[ring_id]
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)  // ring_id is not range-checked here
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(  // address of rings[ring_id].fc.current_task_index
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(  // address of rings[ring_id].fc.last_task_alive
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Takes the
+// per-ring window-size array (mirroring setup_pointers_per_ring's signature)
+// so it still agrees with the SM layout if ring sizes ever diverge. Only
+// rings [0, ring_id) are read; ring_id is range-checked by assert (debug only).
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);  // step past the SM header
+    for (int r = 0; r < ring_id; r++) {  // step past each earlier ring's three aligned arrays
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 39d6e4ad2..b63f20676 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -47,12 +47,12 @@
 #include "pto_runtime2_types.h"
 #include "tensor.h"
 
-struct PTO2OrchestratorState;  // forward declare
-
 /**
  * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
  * region offsets returned by DeviceArena::reserve() so init_from_layout()
  * can fetch the matching pointers after the arena is committed.
+ *
+ * All offsets are relative to the arena's base.
  */
 struct PTO2TensorMapLayout {
     size_t off_buckets;
@@ -367,8 +367,6 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    PTO2OrchestratorState *orch{nullptr};
-
     uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
@@ -433,11 +431,19 @@ struct PTO2TensorMap {
     reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 
     /**
-     * Phase 3: bind region pointers and initialize state. The arena must already
-     * be committed; layout must have been produced by reserve_layout() against
-     * the same arena.
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
      */
-    bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
 
     /**
      * Tear down state. Does not free memory — the arena owns the backing
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index a4aef9c04..4a690e8ca 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -48,7 +48,7 @@
 #define RUNTIME_MAX_ARGS 128
 #define RUNTIME_MAX_WORKER 108  // 36 AIC + 72 AIV cores
 #define RUNTIME_MAX_FUNC_ID 1024
-#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 1MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
 #define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
 
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
@@ -127,18 +127,25 @@ struct HostApi {
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Lay out and commit the per-Worker static device arena that backs both
-    // the PTO2 GM heap and the PTO2 shared memory in a single underlying
-    // allocation. Must be called once before acquire_pooled_gm_heap /
-    // acquire_pooled_gm_sm. Returns 0 on success, -1 on allocation failure.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory. The static arena must already be committed via
-    // setup_static_arena; the returned pointer is owned by the DeviceRunner
-    // and freed in `DeviceRunner::finalize()` — do NOT pass it to
-    // device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
@@ -218,6 +225,13 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; the AICPU reads them in its boot path to skip the
+    // on-device init_data pass and reuse the pooled, prebuilt arena buffer
+    // (already populated by runtime_init_data_from_layout + wire on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
@@ -254,6 +268,16 @@ class Runtime {
     void set_slot_states_ptr(void *p);
     void set_orch_args(const ChipStorageTaskArgs &args);
 
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
     void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 281a714fb..2d777e9b0 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -61,152 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
 }
 #endif
 
-// =============================================================================
-// Ready Queue Implementation
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        queue->slots[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler Initialization
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) {
-    ring = &sm_header->rings[ring_id];
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-
-    // Initialize all per-task slot state fields.
-    // bind() sets payload, task, ring_id — immutable after init, bound once
-    // to their fixed shared-memory addresses.
-    // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
-    // rest zero) so the first submit needs no reset.
-    for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast<uint8_t>(ring_id));
-        ring->slot_states[i].reset_for_reuse();
-        ring->slot_states[i].fanin_count = 0;
-        ring->slot_states[i].active_mask = ActiveMask{};
-    }
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = sm_header_arg;
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    // Per-ring scheduler state — no arena buffers, just field init.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init(sm_header_arg, r)) {
-            return false;
-        }
-    }
-
-    // Ready queues — one per resource shape plus DUMMY.
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-
-    // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated
-    // base + capacity, so we just plumb the arena region into it.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        // calloc-equivalent: pool expects entries zeroed at construction.
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(
-            dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code
-        );
-    }
-
-    // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop).
-    if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-
-    sched->wiring.queue.destroy();
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-}
-
 // =============================================================================
 // Debug Utilities
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 32887d0be..173f65135 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue {
 //                     initialize sequence counters
 //   destroy: forget the slots pointer (arena owns the buffer)
 size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
 void ready_queue_destroy(PTO2ReadyQueue *queue);
 
 // =============================================================================
@@ -449,13 +456,17 @@ struct alignas(64) PTO2SpscQueue {
         return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
     }
 
-    // Bind buffer pointer + reset indices. The capacity must be a power of two
-    // and match the value passed to reserve_layout.
-    bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+    // Writes everything except the arena-internal `buffer_` pointer field:
+    // nulls the slot pointer array, sets mask_ and resets head_/tail_. No
+    // address is stored in buffer_ here; call wire_arena_pointers() afterwards
+    // (host with the host arena, AICPU with the attached device arena).
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
         if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
         for (uint64_t i = 0; i < capacity; i++)
-            buffer_[i] = nullptr;
+            buf[i] = nullptr;
         mask_ = capacity - 1;
         head_.store(0, std::memory_order_relaxed);
         tail_.store(0, std::memory_order_relaxed);
@@ -464,6 +475,12 @@ struct alignas(64) PTO2SpscQueue {
         return true;
     }
 
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
     // Arena owns the buffer; here we only forget our pointer.
     void destroy() { buffer_ = nullptr; }
 
@@ -561,7 +578,12 @@ struct PTO2SchedulerState {
         // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
         alignas(64) PTO2DepListPool dep_pool;
 
-        bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id);
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
         void destroy();
 
         void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
@@ -1040,10 +1062,23 @@ struct PTO2SchedulerState {
 
     // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
     // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
     static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
 
-    // Phase 3: bind region pointers and initialize state.
-    bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header);
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
 
     // Forget per-region pointers; arena owns the backing memory.
     void destroy();
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..d66acfcc4
--- /dev/null
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Reserve room for `capacity` slots, based at PTO2_ALIGN_SIZE so that CAS
+    // traffic on slot 0 cannot false-share with the preceding arena region.
+    const size_t slots_bytes = capacity * sizeof(PTO2ReadyQueueSlot);
+    return arena.reserve(slots_bytes, PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // mask = capacity - 1 only works as an index mask for a non-zero power of two.
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));  // offset resolved against arena
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // The slots[] buffer belongs to the arena: drop the pointer, free nothing.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // Reset the scheduler-local fields first.
+    this->advance_lock.store(0, std::memory_order_relaxed);
+    this->last_task_alive = 0;
+
+    // Record the device address of this ring's SM header. It is obtained by
+    // offset arithmetic on sm_dev_base and only stored; the SM is not read.
+    this->ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+
+    // Slot-state setup (bind_ring, reset_for_reuse, fanin_count/active_mask
+    // clearing) is done by PTO2SharedMemoryHandle::init_header_per_ring during
+    // SM reset, so host prebuilt-arena init performs no SM access here.
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }  // forget the stored SM ring-header address
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    PTO2SchedulerLayout result{};
+    result.dep_pool_capacity = dep_pool_capacity;
+    result.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    result.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+
+    // Regions are reserved in this order: per-shape ready queues, the dummy
+    // ready queue, per-ring dep pools, then the wiring SPSC buffer.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape)
+        result.off_ready_queue_slots[shape] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    result.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    // Each dep pool is based at PTO2_ALIGN_SIZE so that writes from scheduler
+    // thread 0 (the sole writer of a ring's dep pool) do not invalidate
+    // adjacent multi-threaded regions such as ready_queue.slots.
+    const size_t dep_pool_bytes = static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx)
+        result.off_dep_pool_entries[ring_idx] = arena.reserve(dep_pool_bytes, PTO2_ALIGN_SIZE);
+    result.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return result;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    // sm_dev_base is only stored and passed to the SM address helpers here.
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    tasks_completed.store(0, std::memory_order_relaxed);
+    tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    // Per-ring scheduler-local state.
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        if (!ring_sched_states[ring_idx].init_data_from_layout(sm_dev_base, ring_idx)) return false;
+    }
+
+    // One ready queue per resource shape plus the dummy queue, all sized by
+    // the capacity recorded in the layout.
+    const auto queue_cap = layout.ready_queue_capacity;
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        const auto slots_off = layout.off_ready_queue_slots[shape];
+        if (!ready_queue_init_data_from_layout(&ready_queues[shape], arena, slots_off, queue_cap)) return false;
+    }
+    const bool dummy_ok =
+        ready_queue_init_data_from_layout(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, queue_cap);
+    if (!dummy_ok) return false;
+
+    // Clear each ring's dep-pool entries, then hand the region to the pool.
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t dep_pool_bytes = static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto *entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[ring_idx]));
+        memset(entries, 0, dep_pool_bytes);
+        ring_sched_states[ring_idx].dep_pool.init(entries, layout.dep_pool_capacity, orch_err);
+    }
+
+    // Wiring SPSC queue and its batch bookkeeping.
+    if (!wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    wiring.batch_index = 0;
+    wiring.batch_count = 0;
+    wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    // Binds arena-region pointers only; no queue or pool contents are written.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_wire_arena_pointers(&ready_queues[shape], arena, layout.off_ready_queue_slots[shape]);
+    }
+    ready_queue_wire_arena_pointers(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        void *const entries = arena.region_ptr(layout.off_dep_pool_entries[ring_idx]);
+        ring_sched_states[ring_idx].dep_pool.base = static_cast<PTO2DepListEntry *>(entries);
+    }
+    wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    // Pointers are dropped, not freed: the backing regions belong to the arena.
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        ring_sched_states[ring_idx].destroy();
+        ring_sched_states[ring_idx].dep_pool.base = nullptr;
+    }
+    wiring.queue.destroy();
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_destroy(&ready_queues[shape]);
+    }
+    ready_queue_destroy(&dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    PTO2OrchestratorLayout result{};
+    result.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    result.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    result.dep_pool_capacity = dep_pool_capacity;
+
+    // Regions are reserved in this order: fanin pools, scope_tasks, scope_begins, tensor map.
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        result.off_fanin_pool[ring_idx] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    const size_t scope_tasks_bytes = static_cast<size_t>(result.scope_tasks_cap) * sizeof(PTO2TaskSlotState *);
+    result.off_scope_tasks = arena.reserve(scope_tasks_bytes, alignof(PTO2TaskSlotState *));
+    const size_t scope_begins_bytes = static_cast<size_t>(result.scope_stack_capacity) * sizeof(int32_t);
+    result.off_scope_begins = arena.reserve(scope_begins_bytes, alignof(int32_t));
+    result.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return result;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    // Start from a value-initialized state, then fill in the data fields.
+    *this = PTO2OrchestratorState{};
+
+    fatal = false;
+    gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+    gm_heap_base = gm_heap;
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+
+    // All rings share one window size; the array form is what the SM layout
+    // helper for task descriptors takes.
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (uint64_t &window : task_window_sizes) window = task_window_size;
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        auto &ring_state = rings[r];
+        auto *descs = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+        auto *cur_idx = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+        // Ring r is given the r-th heap_size-byte slice of the GM heap.
+        void *heap_slice = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+        ring_state.task_allocator.init(
+            descs, static_cast<int32_t>(task_window_size), cur_idx, last_alive, heap_slice, heap_size, orch_err
+        );
+
+        // Zero the fanin spill region before the pool takes it over.
+        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        ring_state.fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err);
+    }
+
+    if (!tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    scope_stack_top = -1;
+    scope_stack_capacity = layout.scope_stack_capacity;
+    scope_tasks_size = 0;
+    scope_tasks_capacity = layout.scope_tasks_cap;
+    manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    // Pointer fields only: the scheduler link plus the arena-region pointers.
+    scheduler = scheduler_arg;
+    scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+    }
+}
+
+void PTO2OrchestratorState::destroy() {
+    // Drop the arena-region pointers; nothing is freed by this function.
+    scope_begins = nullptr;
+    scope_tasks = nullptr;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        rings[r].fanin_pool.base = nullptr;
+    }
+    tensor_map.destroy();
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }  // non-owning
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    // Every ring uses the same window size, narrowed to int32_t for the orchestrator layout.
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int32_t &window : task_window_sizes) window = static_cast<int32_t>(task_window_size);
+
+    PTO2RuntimeArenaLayout result{};
+    result.dep_pool_capacity = dep_pool_capacity;
+    result.task_window_size = task_window_size;
+
+    // Reserved in order: SM handle, orchestrator regions, scheduler regions,
+    // the PTO2Runtime object, then the AICore completion mailbox.
+    result.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    result.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
+    result.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    result.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    result.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+    result.arena_size = arena.total_size();
+    return result;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    // Clear the SM handle and the runtime object before any field is set.
+    void *const sm_handle_region = arena.region_ptr(layout.off_sm_handle);
+    memset(sm_handle_region, 0, sizeof(PTO2SharedMemoryHandle));
+    auto *const runtime = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(runtime, 0, sizeof(PTO2Runtime));
+
+    // ops is deliberately left zeroed: the AICPU fills it at boot.
+    runtime->total_cycles = 0;
+    runtime->gm_heap_owned = false;
+    runtime->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
+    runtime->gm_heap = gm_heap_dev_base;
+    runtime->mode = mode;
+
+    // Sub-state init runs orchestrator first, then scheduler; the first
+    // failure ends the call with nullptr.
+    const bool orch_ok = runtime->orchestrator.init_data_from_layout(
+        layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
+    );
+    if (!orch_ok) return nullptr;
+    if (!runtime->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) return nullptr;
+
+    // The mailbox region is cleared only after both sub-states succeeded.
+    void *const mailbox_region = arena.region_ptr(layout.off_mailbox);
+    memset(mailbox_region, 0, sizeof(AICoreCompletionMailbox));
+
+    return runtime;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Only pointers are cleared; the arena buffer is pooled across runs by
+    // DeviceRunner and is never freed here.
+    if (rt == nullptr) return;
+    rt->sm_handle = nullptr; rt->aicore_mailbox = nullptr;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+}
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
index 358c87f57..1e1edff92 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
@@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring(
     header->sched_error_bitmap.store(0, std::memory_order_relaxed);
     header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
     header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
 }
 
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
similarity index 82%
rename from src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
rename to src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index c09e6f4f6..b99c67233 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -30,7 +30,6 @@
 
 #include "common.h"
 #include "common/unified_log.h"
-#include "pto_orchestrator.h"
 
 // =============================================================================
 // TensorMap Lookup Chain Length Statistics (compile-time toggle)
@@ -82,37 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task
     return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
 }
 
-bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
     num_buckets = layout.num_buckets;
     pool_size = layout.pool_size;
 
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
 
+    // buckets[]: empty == nullptr.
     for (int32_t i = 0; i < num_buckets; i++) {
-        buckets[i] = nullptr;
+        buckets_arena[i] = nullptr;
     }
 
-    memset(entry_pool, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
+    // The pool's persistent invariant after init is "bucket_index == -1 means
+    // not linked", set explicitly below.
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
     for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool[i].bucket_index = -1;
-        entry_pool[i].next_in_bucket = nullptr;
-        entry_pool[i].prev_in_bucket = nullptr;
-        entry_pool[i].next_in_task = nullptr;
-        entry_pool[i].prev_in_task = nullptr;
-        entry_pool[i].producer_task_id = PTO2TaskId{};
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
     }
 
-    memset(free_entry_list, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+    // only after entries are freed back, so the body of the array stays as 0.
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
 
     next_entry_idx = 0;
     free_num = 0;
 
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
         for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            task_entry_heads[r][i] = nullptr;
+            heads_arena[i] = nullptr;
         }
         task_window_sizes[r] = layout.task_window_sizes[r];
         last_task_alives[r] = 0;
@@ -122,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr
     return true;
 }
 
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+}
+
 void PTO2TensorMap::destroy() {
     // Arena owns the backing memory; here we only forget our pointers so any
     // stray post-destroy access trips a nullptr dereference instead of reading
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 7daa54ed5..0ebb2ef79 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -46,6 +46,8 @@ Runtime::Runtime() {
     gm_heap_ptr_ = nullptr;
     slot_states_ptr_ = nullptr;
     orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
 
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
@@ -76,6 +78,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
 void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    this->prebuilt_runtime_offset_ = runtime_off;
+    this->prebuilt_arena_base_ = arena_base;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }  // nullptr by default
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }  // 0 by default
+
 // Device orchestration SO metadata (bytes live in a separate device buffer
 // owned by DeviceRunner; only the address/size travels in Runtime).
 void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h
index a0ade3dc3..ffe34c479 100644
--- a/src/common/device_comm/device_arena.h
+++ b/src/common/device_comm/device_arena.h
@@ -95,12 +95,39 @@ class DeviceArena {
     // the trampoline's free path must therefore be nothrow.)
     void *commit(size_t base_align = kDefaultBaseAlign);
 
+    // Phase 2 alternative: attach to an externally-owned buffer instead of
+    // allocating one. Caller guarantees:
+    //   (a) `external_base` is already `base_align`-aligned — attach does
+    //       NOT forward-align, since the prebuilt image was constructed for
+    //       the address the caller advertised;
+    //   (b) the buffer is at least `total_size()` bytes (the sum of sizes
+    //       passed to reserve()), since attach uses no forward-alignment
+    //       slack of its own;
+    //   (c) all region offsets the caller plans to read back via
+    //       `region_ptr(off)` are held by the caller — attach does NOT
+    //       repopulate the internal region table, and reserve() cannot run
+    //       after attach (it asserts !committed_). `region_size()` likewise
+    //       returns 0 for attached arenas; treat the arena post-attach as
+    //       a base-pointer wrapper.
+    //
+    // Re-attach (release + attach the same or another buffer) is permitted
+    // so the AICPU boot path can rebind the same pooled image each run.
+    //
+    // The external buffer is NOT freed by release()/~DeviceArena(); ownership
+    // stays with the caller. Used for the prebuilt-arena fast path where
+    // a host-built image is rtMemcpy'd into a device buffer that DeviceRunner
+    // owns across runs.
+    void attach(void *external_base, size_t base_align = kDefaultBaseAlign) noexcept;
+
     // Phase 3: pointer to the sub-region at `offset`. Asserts if called
     // before commit().
     void *region_ptr(size_t offset) const noexcept;
 
     // Size of the sub-region whose offset matches `offset`. Linear scan;
-    // intended for debug / assertions, not hot path.
+    // intended for debug / assertions, not hot path. Returns 0 for an
+    // attached arena (attach() does not repopulate the region table) —
+    // callers in the prebuilt-image path keep sizes alongside their offsets
+    // instead.
     size_t region_size(size_t offset) const noexcept;
 
     // Free the backing buffer (if any) and reset to the pre-commit state so
@@ -135,6 +162,9 @@ class DeviceArena {
     size_t raw_size_{0};
     void *base_{nullptr};
     bool committed_{false};
+    // True when committed via attach(): the backing buffer is externally
+    // owned, so release() must not call free_().
+    bool attached_{false};
 
     size_t alloc_count_{0};
     size_t free_count_{0};
@@ -166,6 +196,38 @@ inline void *DeviceArena::commit(size_t base_align) {
     return base_;
 }
 
+inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept {
+    // Re-attach (e.g. the AICPU boot path attaching on every run) is allowed:
+    // the previous attachment is released first. An alloc-backed commit()
+    // must never be dropped implicitly, so that case aborts unconditionally;
+    // a bare assert() would be stripped from release builds, where release()
+    // would then free the committed buffer and carry on.
+    if (committed_) {
+        if (!attached_) {
+            assert(false && "DeviceArena::attach() called after commit (only re-attach is allowed)");
+            std::abort();
+        }
+        release();
+    }
+    // The caller must supply a non-null base that is already aligned to a
+    // power-of-two base_align. attach() does not forward-align, because that
+    // would move the visible base away from the address the prebuilt image
+    // was laid out for. Violations abort in every build type: continuing
+    // would hand out region pointers that disagree with the image's offsets.
+    const auto raw = reinterpret_cast<uintptr_t>(external_base);
+    const bool ok = (external_base != nullptr) && (base_align > 0) && ((base_align & (base_align - 1)) == 0) &&
+                    ((raw & (static_cast<uintptr_t>(base_align) - 1)) == 0);
+    if (!ok) {
+        assert(false && "DeviceArena::attach(): null base, non-power-of-two align, or pre-alignment violated");
+        std::abort();
+    }
+    base_ = external_base;
+    raw_base_ = nullptr;
+    raw_size_ = 0;
+    committed_ = true;
+    attached_ = true;
+}
+
 inline void *DeviceArena::region_ptr(size_t offset) const noexcept {
     assert(committed_ && "DeviceArena::region_ptr() called before commit()");
     return reinterpret_cast<char *>(base_) + offset;
@@ -179,7 +241,8 @@ inline size_t DeviceArena::region_size(size_t offset) const noexcept {
 }
 
 inline void DeviceArena::release() noexcept {
-    if (raw_base_ != nullptr) {
+    // attached arenas wrap externally-owned memory — never free.
+    if (raw_base_ != nullptr && !attached_) {
         free_(ctx_, raw_base_);
         ++free_count_;
     }
@@ -189,4 +252,5 @@ inline void DeviceArena::release() noexcept {
     cursor_ = 0;
     region_count_ = 0;
     committed_ = false;
+    attached_ = false;
 }
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 89314d800..39cf5977b 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT
     ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp
     ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
     ${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp
+    ${A2A3_RUNTIME_DIR}/shared/pto_runtime2_init.cpp
     ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp
 )
 target_include_directories(a2a3_rt_objs PUBLIC
@@ -193,6 +194,45 @@ function(add_a5_test name src)
     add_test(NAME ${name} COMMAND ${name})
 endfunction()
 
+# ---------------------------------------------------------------------------
+# A5 runtime sources, mirroring a2a3_rt_objs. Bundled into an OBJECT library
+# so the runtime .cpp files compile once and the resulting .o files are
+# reused across every a5 runtime test executable.
+# ---------------------------------------------------------------------------
+set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime)
+
+add_library(a5_rt_objs OBJECT
+    ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp
+    ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_tensormap.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_runtime2_init.cpp
+    ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp
+)
+target_include_directories(a5_rt_objs PUBLIC
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
+    ${CMAKE_SOURCE_DIR}/../../../src/common/log/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/device_comm
+)
+target_compile_options(a5_rt_objs PUBLIC -D_GLIBCXX_USE_CXX11_ABI=0)
+
+function(add_a5_runtime_test name src)
+    # Builds the gtest executable ${name} from ${src}, links it against the
+    # shared a5_rt_objs object library, and registers it with CTest under the
+    # "no_hardware" label.
+    add_executable(${name} ${src})
+    target_link_libraries(${name} PRIVATE
+        a5_rt_objs ${GTEST_MAIN_LIB} ${GTEST_LIB} pthread
+    )
+    target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS})
+    add_test(NAME ${name} COMMAND ${name})
+    set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
+endfunction()
+
 function(add_task_interface_test name src)
     add_executable(${name} ${src})
     target_include_directories(${name} PRIVATE
@@ -313,6 +353,21 @@ add_a2a3_runtime_test(test_wiring           a2a3/test_wiring.cpp)
 # ---------------------------------------------------------------------------
 add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)
 
+# A5 trb runtime UTs — mirror of a2a3 trb runtime UTs, link against a5_rt_objs.
+# Target names carry the a5_ prefix because hierarchical/test_tensormap (and
+# the unprefixed a2a3 runtime targets test_scheduler_state / test_ready_queue
+# / ...) already own those bare names.
+add_a5_runtime_test(test_a5_task_allocator   a5/test_task_allocator.cpp)
+add_a5_runtime_test(test_a5_dep_list_pool    a5/test_dep_list_pool.cpp)
+add_a5_runtime_test(test_a5_scheduler_state  a5/test_scheduler_state.cpp)
+add_a5_runtime_test(test_a5_task_state       a5/test_task_state.cpp)
+add_a5_runtime_test(test_a5_ready_queue      a5/test_ready_queue.cpp)
+add_a5_runtime_test(test_a5_shared_memory    a5/test_shared_memory.cpp)
+add_a5_runtime_test(test_a5_tensormap        a5/test_tensormap.cpp)
+add_a5_runtime_test(test_a5_fanin_pool       a5/test_fanin_pool.cpp)
+add_a5_runtime_test(test_a5_spsc_queue       a5/test_spsc_queue.cpp)
+add_a5_runtime_test(test_a5_wiring           a5/test_wiring.cpp)
+
 # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp
 # alongside the test (faster than dlopen'ing libsimpler_log.so for a unit test).
 set(SIMPLER_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/common/log)
diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp
index 413e36cfd..f12b1e7c7 100644
--- a/tests/ut/cpp/a2a3/test_ready_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp
@@ -61,7 +61,8 @@ class ReadyQueueTest : public ::testing::Test {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
 
     void TearDown() override {
@@ -231,7 +232,8 @@ class ReadyQueueBoundaryTest : public ::testing::Test {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, QUEUE_CAP));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
     void TearDown() override {
         ready_queue_destroy(&queue);
@@ -330,7 +332,8 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
     void TearDown() override {
         ready_queue_destroy(&queue);
diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
index 952aad55a..75476dedf 100644
--- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp
+++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
@@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp
index 28e43d5a2..5dce3ba4a 100644
--- a/tests/ut/cpp/a2a3/test_spsc_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp
@@ -47,7 +47,8 @@ class SpscQueueTest : public ::testing::Test {
         memset(&queue, 0, sizeof(queue));
         const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(queue.init_from_layout(arena, off, CAPACITY));
+        ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY));
+        queue.wire_arena_pointers(arena, off);
     }
 
     void TearDown() override {
@@ -74,9 +75,9 @@ TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) {
     const size_t off = PTO2SpscQueue::reserve_layout(local, 1);  // dummy reservation so commit succeeds
     (void)off;
     ASSERT_NE(local.commit(), nullptr);
-    EXPECT_FALSE(bad.init_from_layout(local, off, 3));
-    EXPECT_FALSE(bad.init_from_layout(local, off, 7));
-    EXPECT_FALSE(bad.init_from_layout(local, off, 0));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 3));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 7));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 0));
 }
 
 TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
@@ -85,9 +86,9 @@ TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
     const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4);
     const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024);
     ASSERT_NE(local.commit(), nullptr);
-    EXPECT_TRUE(q.init_from_layout(local, off4, 4));
+    EXPECT_TRUE(q.init_data_from_layout(local, off4, 4));
     q.destroy();
-    EXPECT_TRUE(q.init_from_layout(local, off1024, 1024));
+    EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024));
     q.destroy();
 }
 
diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp
index 383003900..512e241a2 100644
--- a/tests/ut/cpp/a2a3/test_task_allocator.cpp
+++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp
@@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
 TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     current_index.store(INT32_MAX - 2);
     last_alive.store(INT32_MAX - 2);
-    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    allocator.init(
+        descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code,
+        /*initial_local_task_id=*/INT32_MAX - 2
+    );
 
     auto r1 = allocator.alloc(0);
     ASSERT_FALSE(r1.failed());
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
index 729b74999..ffced6f9a 100644
--- a/tests/ut/cpp/a2a3/test_task_state.cpp
+++ b/tests/ut/cpp/a2a3/test_task_state.cpp
@@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
index 204d00e42..805a9e079 100644
--- a/tests/ut/cpp/a2a3/test_tensormap.cpp
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -83,7 +83,8 @@ class TensorMapTest : public ::testing::Test {
         int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
         auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(tmap.init_from_layout(layout, arena));
+        ASSERT_TRUE(tmap.init_data_from_layout(layout, arena));
+        tmap.wire_arena_pointers(layout, arena);
     }
 
     void TearDown() override {
@@ -113,7 +114,8 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
     int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
     auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws);
     ASSERT_NE(bad_arena.commit(), nullptr);
-    EXPECT_TRUE(bad.init_from_layout(layout, bad_arena));
+    EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena));
+    bad.wire_arena_pointers(layout, bad_arena);
     bad.destroy();
 }
 
diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp
index b01052a85..1e8fee9c5 100644
--- a/tests/ut/cpp/a2a3/test_wiring.cpp
+++ b/tests/ut/cpp/a2a3/test_wiring.cpp
@@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp
index 9dea3ae94..f12b1e7c7 100644
--- a/tests/ut/cpp/a5/test_ready_queue.cpp
+++ b/tests/ut/cpp/a5/test_ready_queue.cpp
@@ -44,6 +44,7 @@
 #include <thread>
 #include <vector>
 
+#include "device_arena.h"
 #include "scheduler/pto_scheduler.h"
 
 // =============================================================================
@@ -55,10 +56,19 @@ class ReadyQueueTest : public ::testing::Test {
     static constexpr uint64_t CAPACITY = 16;  // Power of 2
 
     PTO2ReadyQueue queue;
+    DeviceArena arena;
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); }
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
 
-    void TearDown() override { ready_queue_destroy(&queue); }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -217,8 +227,18 @@ class ReadyQueueBoundaryTest : public ::testing::Test {
     PTO2ReadyQueue queue{};
     PTO2TaskSlotState dummy[8]{};
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, QUEUE_CAP)); }
-    void TearDown() override { ready_queue_destroy(&queue); }
+    DeviceArena arena;
+
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) {
@@ -307,8 +327,18 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
     static constexpr uint64_t CAPACITY = 1024;
     PTO2ReadyQueue queue;
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); }
-    void TearDown() override { ready_queue_destroy(&queue); }
+    DeviceArena arena;
+
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp
index 952aad55a..75476dedf 100644
--- a/tests/ut/cpp/a5/test_scheduler_state.cpp
+++ b/tests/ut/cpp/a5/test_scheduler_state.cpp
@@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp
index a2c80ca05..5dce3ba4a 100644
--- a/tests/ut/cpp/a5/test_spsc_queue.cpp
+++ b/tests/ut/cpp/a5/test_spsc_queue.cpp
@@ -27,6 +27,7 @@
 #include <thread>
 #include <vector>
 
+#include "device_arena.h"
 #include "scheduler/pto_scheduler.h"
 
 // =============================================================================
@@ -38,15 +39,22 @@ class SpscQueueTest : public ::testing::Test {
     static constexpr uint64_t CAPACITY = 16;  // must be power of 2
 
     PTO2SpscQueue queue{};
+    DeviceArena arena;
     // Dummy slot states used as push values
     alignas(64) PTO2TaskSlotState slots[64]{};
 
     void SetUp() override {
         memset(&queue, 0, sizeof(queue));
-        ASSERT_TRUE(queue.init(CAPACITY));
+        const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY));
+        queue.wire_arena_pointers(arena, off);
     }
 
-    void TearDown() override { queue.destroy(); }
+    void TearDown() override {
+        queue.destroy();
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -60,17 +68,27 @@ TEST_F(SpscQueueTest, InitValidState) {
 }
 
 TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) {
+    // init_data_from_layout rejects non-power-of-two capacities. Use a fresh
+    // local arena, since reserve runs before commit.
     PTO2SpscQueue bad{};
-    EXPECT_FALSE(bad.init(3));
-    EXPECT_FALSE(bad.init(7));
-    EXPECT_FALSE(bad.init(0));
+    DeviceArena local;
+    const size_t off = PTO2SpscQueue::reserve_layout(local, 1);  // dummy reservation so commit succeeds
+    (void)off;
+    ASSERT_NE(local.commit(), nullptr);
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 3));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 7));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 0));
 }
 
 TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
     PTO2SpscQueue q{};
-    EXPECT_TRUE(q.init(4));
+    DeviceArena local;
+    const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4);
+    const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024);
+    ASSERT_NE(local.commit(), nullptr);
+    EXPECT_TRUE(q.init_data_from_layout(local, off4, 4));
     q.destroy();
-    EXPECT_TRUE(q.init(1024));
+    EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024));
     q.destroy();
 }
 
diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp
index 383003900..512e241a2 100644
--- a/tests/ut/cpp/a5/test_task_allocator.cpp
+++ b/tests/ut/cpp/a5/test_task_allocator.cpp
@@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
 TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     current_index.store(INT32_MAX - 2);
     last_alive.store(INT32_MAX - 2);
-    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    allocator.init(
+        descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code,
+        /*initial_local_task_id=*/INT32_MAX - 2
+    );
 
     auto r1 = allocator.alloc(0);
     ASSERT_FALSE(r1.failed());
diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp
index 729b74999..ffced6f9a 100644
--- a/tests/ut/cpp/a5/test_task_state.cpp
+++ b/tests/ut/cpp/a5/test_task_state.cpp
@@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
index 10eef0317..ec83a064d 100644
--- a/tests/ut/cpp/a5/test_tensormap.cpp
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -28,6 +28,7 @@
 #include <set>
 #include <vector>
 
+#include "device_arena.h"
 #include "pto_orchestration_api.h"
 #include "pto_tensormap.h"
 
@@ -76,13 +77,20 @@ class TensorMapTest : public ::testing::Test {
     static constexpr int32_t WINDOW_SIZE = 32;
 
     PTO2TensorMap tmap{};
+    DeviceArena arena;
 
     void SetUp() override {
         int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
-        ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+        auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(tmap.init_data_from_layout(layout, arena));
+        tmap.wire_arena_pointers(layout, arena);
     }
 
-    void TearDown() override { tmap.destroy(); }
+    void TearDown() override {
+        tmap.destroy();
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -97,13 +105,19 @@ TEST_F(TensorMapTest, InitValidState) {
     EXPECT_EQ(tmap.valid_count(), 0);
 }
 
-TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
-    PTO2TensorMap bad{};
+TEST_F(TensorMapTest, InitWithPowerOfTwoBucketsSucceeds) {
+    // The reject path for non-power-of-2 bucket counts is enforced via an
+    // always_assert inside reserve_layout. It is not asserted here because
+    // EXPECT_DEATH cannot run reliably in release builds where always_assert
+    // may compile out. Cover only the accepted (power-of-2) shape.
+    PTO2TensorMap ok{};
+    DeviceArena ok_arena;
     int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
-    EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
-    EXPECT_FALSE(bad.init(7, 64, ws));
-    EXPECT_TRUE(bad.init(8, 64, ws));
-    bad.destroy();
+    auto layout = PTO2TensorMap::reserve_layout(ok_arena, 8, 64, ws);
+    ASSERT_NE(ok_arena.commit(), nullptr);
+    EXPECT_TRUE(ok.init_data_from_layout(layout, ok_arena));
+    ok.wire_arena_pointers(layout, ok_arena);
+    ok.destroy();
 }
 
 // =============================================================================
diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp
index b01052a85..1e8fee9c5 100644
--- a/tests/ut/cpp/a5/test_wiring.cpp
+++ b/tests/ut/cpp/a5/test_wiring.cpp
@@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {