diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index cf6ddea88..e3e1cfc2d 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -249,42 +249,68 @@ int AicpuSoInfo::finalize() { DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { - if (static_arena_.is_committed()) { - // Idempotent for the production case (sizes do not change across a - // worker's lifetime). If a caller asks for a larger layout, redo it. - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - return -1; - } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. 
Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { + return 0; + } + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false, so the next entry's + // is_committed() guard skips the release branch. release() is + // idempotent on a never-committed arena (zeroes cursor_). + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + // Failure of a later region leaves earlier peers committed on purpose: + // pooled pointers previously returned to callers must stay valid even if + // this resize attempt aborts. 
+ if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); +} + +void *DeviceRunner::acquire_pooled_runtime_arena() { + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — return nullptr so a caller that asks anyway can detect it. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1222,14 +1248,16 @@ int DeviceRunner::finalize() { // perf_cleanup guard; this is the backstop for the no-run-since-init case. finalize_collectors(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it.
+ gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 53fb6555f..93501a916 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -185,25 +185,36 @@ struct KernelArgsHelper { class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no + * prebuilt runtime arena) — the corresponding arena stays uncommitted. + * Returns 0 on success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. 
Both pointers are stable for the lifetime of the Worker and - * the single underlying device buffer is released in `finalize()`. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have already committed the relevant region; + * otherwise these return nullptr. All pointers are stable for the + * Worker's lifetime; the three underlying device buffers are released + * in `finalize()`. + * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -602,22 +613,31 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block — three separate device_malloc calls are friendlier than one + // big one. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. 
static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // the same buffer when a later worker init asks for an equal-or-smaller + // layout on an already-committed arena. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Device resources rtStream_t stream_aicpu_{nullptr}; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 744b7291c..29c14d862 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -370,6 +378,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; // Restore kernel addrs + orch symbol names + active_callable_id; the diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1635f3a7a..9a9cbbabf 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -122,40 +122,68 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data, DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { - if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). 
If a caller asks for a larger layout on any + // region, redo just that region. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { + return 0; + } + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; + return -1; } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). 
- static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); + cached_gm_heap_size_ = 0; + cached_gm_sm_size_ = 0; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); +} + +void *DeviceRunner::acquire_pooled_runtime_arena() { + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — return nullptr so a caller that asks anyway can detect it. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1032,24 +1060,29 @@ int DeviceRunner::finalize() { // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it.
+ gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; - // Free all remaining allocations - mem_alloc_.finalize(); - clear_cpu_sim_shared_storage(); - - // Free the 8-byte device_wall buffer (allocated lazily in run()). + // Free the 8-byte device_wall buffer (allocated lazily in run()) before + // mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_, + // so doing it after finalize would be a use-after-finalize. if (device_wall_dev_ptr_ != nullptr) { free_tensor(device_wall_dev_ptr_); device_wall_dev_ptr_ = nullptr; } + + // Free all remaining allocations + mem_alloc_.finalize(); + clear_cpu_sim_shared_storage(); + device_id_ = -1; worker_count_ = 0; last_runtime_ = nullptr; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 73b3dfea2..46ee45913 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -75,24 +75,33 @@ class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. 
+ * `runtime_arena_size` is 0 for the hbg path (leaves that arena + * uncommitted). Idempotent on identical sizes. Returns 0 on success, + * -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have already committed the relevant region. + * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -280,22 +289,29 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free to mem_alloc_. 
static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Simulation state (no actual device resources) KernelArgs kernel_args_; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 4ad438a9c..fca663610 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -333,6 +341,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id); diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 41845bdf0..ccdc05ce0 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -140,9 +140,16 @@ struct HostApi { // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of // which runtime variant it is built against. Unset for this variant; do // not call. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // setup_static_arena takes a third size, runtime_arena_size, for the + // prebuilt runtime arena. host_build_graph has no prebuilt runtime arena, + // so an hbg-side caller would pass 0: the platform layer then leaves that + // arena uncommitted and acquire_pooled_runtime_arena() returns nullptr. + // The signatures below must stay in sync with the wrappers that + // pto_runtime_c_api.cpp assigns into this struct. + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer.
`callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f8e35917b..5c31c5b9a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -467,29 +467,60 @@ int32_t AicpuExecutor::run(Runtime *runtime) { static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity ); - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt + // runtime arena image at host build time, so we no longer fetch + // them here. They remain on the host Runtime instance and on the + // PTO2Runtime header for diagnostic purposes only. + (void)dep_pool_capacity; + void *sm_ptr = runtime->get_gm_sm_ptr(); uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - rt = runtime_create_from_sm( - PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_, - dep_pool_capacity - ); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. + + // Prebuilt-arena fast path. Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. 
+ void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). + runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) { + LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx); runtime_init_ready_.store(true, std::memory_order_release); return -1; } + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). + runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); #if PTO2_PROFILING rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. 
runtime->set_slot_states_ptr(nullptr); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp index 027805918..71a482632 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp @@ -487,11 +487,15 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); auto annot_layout = PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); - if (replay_arena.commit() == nullptr || !tm_oracle.init_from_layout(oracle_layout, replay_arena) || - !tm_annot.init_from_layout(annot_layout, replay_arena)) { + if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) || + !tm_annot.init_data_from_layout(annot_layout, replay_arena)) { LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size); return -3; } + // Replay tensormaps live entirely on host; only arena-internal pointer + // fields need wiring (no parent-orch back-reference exists anymore). + tm_oracle.wire_arena_pointers(oracle_layout, replay_arena); + tm_annot.wire_arena_pointers(annot_layout, replay_arena); // JSON output accumulators. 
std::vector task_table; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index a75205196..e40aa5ae7 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -36,11 +36,13 @@ #include #include "../common/pto_runtime_status.h" +#include "../runtime/pto_runtime2.h" #include "../runtime/pto_shared_memory.h" #include "../runtime/runtime.h" #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" +#include "device_arena.h" #include "prepare_callable_common.h" // Helper: return current time in milliseconds @@ -271,15 +273,35 @@ extern "C" int bind_prepared_to_runtime_impl( uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE; uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE; - // Lay out the per-Worker static device arena. GM heap (orchestrator output - // buffers, all rings combined) and PTO2 shared memory live in a single - // backing allocation; setup_static_arena reserves both regions and - // commits in one shot. Owned by DeviceRunner across runs — do NOT record - // in tensor_pairs_; the free is deferred to DeviceRunner::finalize(). + // Set up the per-Worker pooled device arenas. GM heap, PTO2 shared memory, + // and the prebuilt runtime arena are three independent device allocations; + // setup_static_arena commits each one separately, one region per arena. + // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the + // free is deferred to DeviceRunner::finalize(). The runtime-arena size is + // determined by replaying the reserve sequence on a host-side arena.
uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); + // dep_pool_size comes from a uint64 env var; reject values that don't fit + // the int32_t layout-sizing path rather than silently truncating. + int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size != 0) { + if (runtime->dep_pool_size > static_cast(INT32_MAX)) { + LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size); + return -1; + } + eff_dep_pool_capacity = static_cast(runtime->dep_pool_size); + } + + int64_t t_prebuilt_start = _now_ms(); + DeviceArena host_arena; // libc malloc backend by default + PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity); + if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); + return -1; + } + int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) { + if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return -1; } @@ -303,9 +325,48 @@ extern "C" int bind_prepared_to_runtime_impl( } runtime->set_gm_sm_ptr(sm_ptr); + void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena(); + if (runtime_arena_dev == nullptr) { + LOG_ERROR("Failed to acquire pooled runtime arena"); + return -1; + } + // Set up device orchestration state runtime->set_orch_args(device_args); + // ------------------------------------------------------------------------- + // Build the prebuilt runtime-arena image on host. + // + // We pre-compute every byte the AICPU's runtime arena would otherwise have + // to write at boot: layout offsets, sub-structure init data, and pointers + // back to the SM / GM heap. 
Then we rtMemcpy the image into the pooled + // runtime-arena region that DeviceRunner keeps alive across runs. AICPU + // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM + // reset) + a handful of device-only field fixups. + // ------------------------------------------------------------------------- + PTO2Runtime *rt = + runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size); + if (rt == nullptr) { + LOG_ERROR("runtime_init_data_from_layout failed"); + return -1; + } + runtime_wire_arena_pointers(host_arena, layout, rt); + + // Stash the layout inside the PTO2Runtime image so the AICPU can recover + // every arena-internal offset after rtMemcpy. The runtime arena's device + // base does NOT travel in this image — it's on the host Runtime + // (set_prebuilt_arena below), since the AICPU needs that pointer + // *before* it can dereference the image. + rt->prebuilt_layout = layout; + + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime); + int64_t t_prebuilt_end = _now_ms(); + LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count); int64_t t_total_end = _now_ms(); @@ -313,6 +374,7 @@ extern "C" int bind_prepared_to_runtime_impl( LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start); LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start); + LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); return 0; diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 5f6d20855..f80c7a655 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -355,11 +355,21 @@ static bool prepare_task( prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); + // Re-bind payload/task pointers each submit. Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init() skip the O(window_size) bind loop. + // Both writes hit the same 64B slot_state cache line we're about to + // dirty below, so the extra cost is two stores on an already-hot line. + // Must precede the scheduler wiring.queue.push at the end of + // submit_task_common — that push is the first read of slot_state->task / + // slot_state->payload by another thread. + out->slot_state->bind_buffers(out->payload, out->task); + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): // fanout_lock=0, fanout_count=1, fanout_head=nullptr, // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 // Fields immutable after RingSchedState::init(): - // payload, task, ring_id + // ring_id // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor // observers); set to PENDING here when orchestrator actually reuses the slot. 
out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); @@ -374,89 +384,6 @@ static bool prepare_task( return true; } -// ============================================================================= -// Orchestrator Initialization -// ============================================================================= - -PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( - DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity -) { - PTO2OrchestratorLayout layout{}; - layout.dep_pool_capacity = dep_pool_capacity; - layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; - layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); - } - layout.off_scope_tasks = arena.reserve( - static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) - ); - layout.off_scope_begins = - arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); - layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); - return layout; -} - -bool PTO2OrchestratorState::init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap, - uint64_t heap_size -) { - auto *orch = this; - *orch = PTO2OrchestratorState{}; - - orch->sm_header = sm_header_arg; - orch->gm_heap_base = gm_heap; - orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - orch->fatal = false; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &ring = sm_header_arg->rings[r]; - - orch->rings[r].task_allocator.init( - ring.task_descriptors, ring.task_window_size, 
&ring.fc.current_task_index, &ring.fc.last_task_alive, - ring_heap_base, heap_size, &sm_header_arg->orch_error_code - ); - - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); - // aligned_zalloc-equivalent: pool relies on zeroed entries. - memset(fanin_entries, 0, fanin_pool_bytes); - orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code); - } - - if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) { - return false; - } - orch->tensor_map.orch = orch; - - orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); - orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); - orch->scope_tasks_size = 0; - orch->scope_tasks_capacity = layout.scope_tasks_cap; - orch->scope_stack_top = -1; - orch->scope_stack_capacity = layout.scope_stack_capacity; - orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; - - return true; -} - -void PTO2OrchestratorState::destroy() { - auto *orch = this; - orch->tensor_map.destroy(); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - orch->rings[r].fanin_pool.base = nullptr; - } - orch->scope_tasks = nullptr; - orch->scope_begins = nullptr; -} - -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } - // ============================================================================= // Scope Management // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 37fd0dcac..7dd47b19a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -142,14 +142,21 @@ 
struct PTO2OrchestratorState { int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); - // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool - // and tensor_map. Arena must be committed; layout must come from - // reserve_layout() against the same arena. - bool init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap, - uint64_t heap_size + // Phase 3a: write everything *except* arena-internal pointer fields. + // sm_dev_base is the SM device address (only stored, never dereferenced); + // task_window_size feeds the per-ring SM address arithmetic. Safe to call + // on a host arena that holds the prebuilt image. + bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size ); + // Phase 3b: write the arena-internal pointer fields (scope_tasks, + // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, + // free_entry_list,task_entry_heads}, scheduler reference). + // Idempotent — host runs once on the image, AICPU runs once after attach. + void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); + // Forget pointers; arena owns the backing buffers. void destroy(); void set_scheduler(PTO2SchedulerState *scheduler); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..abd2a7510 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -68,10 +68,22 @@ class PTO2TaskAllocator { public: /** * Initialize the allocator with task ring and heap ring resources. 
+ * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. + * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. */ void init( PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, - std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 ) { descriptors_ = descriptors; window_size_ = window_size; @@ -81,7 +93,7 @@ class PTO2TaskAllocator { heap_base_ = heap_base; heap_size_ = heap_size; error_code_ptr_ = error_code_ptr; - local_task_id_ = current_index_ptr->load(std::memory_order_relaxed); + local_task_id_ = initial_local_task_id; heap_top_ = 0; heap_tail_ = 0; last_alive_seen_ = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..f39bac365 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = { }; // ============================================================================= -// Runtime Creation and Destruction +// Runtime Lifecycle (AICPU-only fixup) // ============================================================================= - -PTO2Runtime *runtime_create_from_sm( 
- PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity -) { - if (!sm_base || sm_size == 0) return nullptr; - - // Phase 1: layout. Reserve every sub-region the runtime needs (including - // the SM handle wrapper itself) without touching memory yet. - int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = static_cast(task_window_size); - } - const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); - PTO2OrchestratorLayout orch_layout = - PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); - PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity); - const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); - const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); - - // Phase 2: single backing allocation. - if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr; - - // Phase 3: bind region pointers and initialize. - PTO2Runtime *rt = static_cast(arena.region_ptr(off_runtime)); - memset(rt, 0, sizeof(*rt)); // calloc-equivalent for the runtime header. - - // Initialize the SM handle wrapper in-place on its arena region before - // anything that reads sm_handle->header (orchestrator / scheduler init). - rt->sm_handle = static_cast(arena.region_ptr(off_sm_handle)); - memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); - if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) { - arena.release(); - return nullptr; - } - +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image. 
The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. + +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) { - arena.release(); - return nullptr; - } - if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) { - rt->orchestrator.destroy(); - arena.release(); - return nullptr; - } - rt->orchestrator.set_scheduler(&rt->scheduler); - - rt->aicore_mailbox = static_cast(arena.region_ptr(off_mailbox)); - memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); - - return rt; -} - -void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) { - if (!rt) { - arena.release(); // safe: idempotent if nothing's committed. - return; - } - - rt->scheduler.destroy(); - rt->orchestrator.destroy(); - rt->aicore_mailbox = nullptr; // arena-owned. - rt->sm_handle = nullptr; // wrapper lives in arena; release() reclaims it. - - // arena.release() frees the single backing buffer that holds rt, - // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot. 
- arena.release(); + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; } void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 5709a85b7..460624e69 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -91,6 +91,30 @@ struct PTO2RuntimeOps { TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); }; +/** + * Layout descriptor for the prebuilt runtime arena. Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_size{0}; + uint64_t heap_size{0}; + int32_t dep_pool_capacity{0}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + /** * PTO Runtime2 context * @@ -118,6 +142,16 @@ struct PTO2Runtime { // Statistics int64_t total_cycles; + + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. 
Populated on host by + // bind_prepared_to_runtime_impl (after init_data + wire); read by + // aicpu_executor.cpp. + PTO2RuntimeArenaLayout prebuilt_layout; }; // ============================================================================= @@ -125,38 +159,60 @@ struct PTO2Runtime { // ============================================================================= /** - * Create runtime from caller-provided GM SM buffer + GM heap. - * - * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime, - * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map - * sub-regions) is laid out on the supplied arena and committed in a single - * backing allocation — including the SM handle wrapper itself. The arena is - * owned by the caller (typically the per-Worker AicpuExecutor); - * runtime_destroy() calls arena.release() once to free the lot. + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. + * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. + */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. * - * `sm_base` / `sm_size` describe the SM buffer that host has already placed - * for the runtime to use; the SM handle wrapper is constructed in-place on - * an arena-reserved region pointing at that buffer. + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). 
Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. * - * @param mode Execution mode - * @param sm_base Pre-allocated SM buffer base (host-owned) - * @param sm_size Size of the SM buffer in bytes - * @param task_window_size Per-ring task window size used to lay out SM - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @param arena Caller-owned arena that sources all runtime sub-regions. - * Must be freshly constructed (no prior commit) — - * runtime_create_from_sm reserves + commits internally. - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. + * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. + */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size ); /** - * Destroy runtime and free all resources. arena.release() is the actual - * memory free; the rt pointer is no longer valid afterward. + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. 
+ */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). */ void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index fcd8a27bd..f217e7ac3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState { // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) std::atomic fanout_refcount; // Dynamic: counts released references - // --- Immutable after RingSchedState::init() (same value on every slot reuse) --- + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. 
PTO2TaskPayload *payload; PTO2TaskDescriptor *task; @@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState { int16_t next_block_idx{0}; // Next block to dispatch (scheduler state) /** - * One-time binding of slot-invariant fields. - * Called during RingSchedState::init() — these values are determined by - * the slot's position in the ring and never change across reuses. + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. */ - void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) { + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { payload = p; task = t; - ring_id = rid; } /** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index 5e1b6faa8..98b832510 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -58,6 +58,13 @@ struct alignas(64) PTO2RingFlowControl { // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. 
If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. void init() { current_task_index.store(0, std::memory_order_relaxed); last_task_alive.store(0, std::memory_order_relaxed); @@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle { void setup_pointers(uint64_t task_window_size); void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); }; + +// ============================================================================= +// SM Device Layout Helpers +// ============================================================================= +// +// When the host pre-builds a runtime-arena image, it needs the device-side +// addresses of several SM sub-fields (ring flow-control counters, +// task_descriptors arrays, orch_error_code) so it can wire them into the +// orchestrator / scheduler init_data path without dereferencing the SM — +// the SM lives in device memory and cannot be touched from host. +// +// These helpers compute those addresses by offset arithmetic on the SM +// device base. Pure pointer math, no loads/stores; safe to call from host. +// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's +// own setup_pointers), so values are guaranteed consistent across sides. 
+namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. 
+inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index cf1f2d28d..b63f20676 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -47,8 +47,6 @@ #include "pto_runtime2_types.h" #include "tensor.h" -struct PTO2OrchestratorState; // forward declare - /** * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the * region offsets returned by DeviceArena::reserve() so init_from_layout() @@ -369,8 +367,6 @@ struct PTO2TensorMap { // Per-ring cleanup progress (for periodic cleanup_retired) int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; - PTO2OrchestratorState *orch{nullptr}; - uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { return task_local_id & (task_window_sizes[ring_id] - 1); } @@ -435,11 +431,19 @@ struct PTO2TensorMap { reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); /** - * Phase 3: bind region pointers and initialize state. The arena must already - * be committed; layout must have been produced by reserve_layout() against - * the same arena. 
+ * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. + */ + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. */ - bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); /** * Tear down state. Does not free memory — the arena owns the backing diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 73b6027c4..8e1bb1567 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -119,19 +119,25 @@ struct HostApi { void (*device_free)(void *dev_ptr); int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - // Lay out and commit the per-Worker static device arena that backs both - // the PTO2 GM heap and the PTO2 shared memory in a single underlying - // allocation. Must be called once before acquire_pooled_gm_heap / - // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on - // success, -1 on allocation failure. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + // memory, trb prebuilt runtime arena) as three independent device + // allocations. `runtime_arena_size == 0` skips the third region (hbg + // path: hbg has no prebuilt runtime arena). 
Idempotent on identical + // sizes; returns 0 on success, -1 on allocation failure. + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory. The static arena must already be committed via - // setup_static_arena; the returned pointer is owned by the DeviceRunner - // and freed in `DeviceRunner::finalize()` — do NOT pass it to - // device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. setup_static_arena must have already + // committed the relevant region; the returned pointer is owned by the + // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it + // to device_free or record it in `tensor_pairs_`. + // + // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is + // only committed when setup_static_arena was invoked with + // runtime_arena_size > 0. Calling it on the hbg path + // (setup_static_arena(...,0)) returns nullptr (not undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute @@ -211,6 +217,13 @@ class Runtime { void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // on-device arena construction and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + // Device orchestration SO (for dlopen on AICPU thread 3). 
// The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. @@ -247,6 +260,16 @@ class Runtime { void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); + // Prebuilt-arena fast path (trb only). Set by host's + // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. + void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + // Device orchestration SO binary (for dlopen on AICPU thread 3) void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index 281a714fb..2d777e9b0 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -61,152 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { } #endif -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { - // Align the slots[] base to a full cache line so MPMC CAS traffic on the - // first slot cannot false-share with whatever region sits in front of us - // (e.g. orchestrator tensormap heads written by the orch thread). 
- return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); -} - -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { - queue->slots = static_cast(arena.region_ptr(slots_off)); - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for (uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void ready_queue_destroy(PTO2ReadyQueue *queue) { - // Arena owns the slots[] buffer; just forget the pointer. - queue->slots = nullptr; -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - ring = &sm_header->rings[ring_id]; - last_task_alive = 0; - advance_lock.store(0, std::memory_order_relaxed); - - // Initialize all per-task slot state fields. - // bind() sets payload, task, ring_id — immutable after init, bound once - // to their fixed shared-memory addresses. - // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, - // rest zero) so the first submit needs no reset. 
- for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); - ring->slot_states[i].reset_for_reuse(); - ring->slot_states[i].fanin_count = 0; - ring->slot_states[i].active_mask = ActiveMask{}; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } - -PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { - PTO2SchedulerLayout layout{}; - layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; - layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; - layout.dep_pool_capacity = dep_pool_capacity; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - } - layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - // Force a cache-line base so writes from scheduler thread 0 (sole - // writer of this ring's dep_pool) do not invalidate adjacent - // multi-threaded regions like ready_queue.slots. - layout.off_dep_pool_entries[r] = - arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - } - layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); - return layout; -} - -bool PTO2SchedulerState::init_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg -) { - PTO2SchedulerState *sched = this; - sched->sm_header = sm_header_arg; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Per-ring scheduler state — no arena buffers, just field init. 
- for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_header_arg, r)) { - return false; - } - } - - // Ready queues — one per resource shape plus DUMMY. - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!ready_queue_init_from_layout( - &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity - )) { - return false; - } - } - if (!ready_queue_init_from_layout( - &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity - )) { - return false; - } - - // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated - // base + capacity, so we just plumb the arena region into it. - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); - // calloc-equivalent: pool expects entries zeroed at construction. - memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); - sched->ring_sched_states[r].dep_pool.init( - dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code - ); - } - - // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop). 
- if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { - return false; - } - sched->wiring.batch_count = 0; - sched->wiring.batch_index = 0; - sched->wiring.backoff_counter = 0; - - return true; -} - -void PTO2SchedulerState::destroy() { - PTO2SchedulerState *sched = this; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - sched->ring_sched_states[r].dep_pool.base = nullptr; - } - - sched->wiring.queue.destroy(); - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - ready_queue_destroy(&sched->ready_queues[i]); - } - ready_queue_destroy(&sched->dummy_ready_queue); -} - // ============================================================================= // Debug Utilities // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 8d50681ba..510187feb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue { // initialize sequence counters // destroy: forget the slots pointer (arena owns the buffer) size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity); -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Writes everything *except* the arena-internal `slots` pointer field +// (sequences/positions on the slot array, capacity, mask). Uses +// arena.region_ptr(slots_off) only to address the slot array for writes; +// does NOT store the pointer in `queue->slots`. Call +// `ready_queue_wire_arena_pointers` afterwards to set the field itself. 
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Stores queue->slots = arena.region_ptr(slots_off). Idempotent. +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); void ready_queue_destroy(PTO2ReadyQueue *queue); // ============================================================================= @@ -449,15 +456,17 @@ struct alignas(64) PTO2SpscQueue { return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE); } - // Bind buffer pointer + reset indices. The capacity must be a power of two - // and match the value passed to reserve_layout. - bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + // Writes everything except the arena-internal `buffer_` pointer field + // (zeros the slot pointer array, mask/head/tail). The host pre-builds the + // image without storing a host address in buffer_; the AICPU wires + // buffer_ at boot via wire_arena_pointers(). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; - buffer_ = static_cast(arena.region_ptr(buffer_off)); + auto *buf = static_cast(arena.region_ptr(buffer_off)); // calloc'd-equivalent: zero the slot pointers so spurious early pops // observe nullptr. for (uint64_t i = 0; i < capacity; i++) - buffer_[i] = nullptr; + buf[i] = nullptr; mask_ = capacity - 1; head_.store(0, std::memory_order_relaxed); tail_.store(0, std::memory_order_relaxed); @@ -466,6 +475,12 @@ struct alignas(64) PTO2SpscQueue { return true; } + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + // Arena owns the buffer; here we only forget our pointer. 
void destroy() { buffer_ = nullptr; } @@ -563,7 +578,12 @@ struct PTO2SchedulerState { // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); + // Initialize arena-internal data + arena-external pointers; does NOT + // store dep_pool.base (that lives in the runtime arena and is wired + // by SchedulerState::wire_arena_pointers). The `ring` field stores + // the device address of the SM ring header — computed via offset + // arithmetic, no SM dereference. + bool init_data_from_layout(void *sm_dev_base, int32_t ring_id); void destroy(); void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } @@ -1042,13 +1062,23 @@ struct PTO2SchedulerState { // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots, // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena. - // Capacities are baked into the returned layout; init_from_layout uses + // Capacities are baked into the returned layout; init_data_from_layout uses // the same values. static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); - // Phase 3: bind region pointers and initialize state. The arena must be - // committed; layout must come from reserve_layout() against the same arena. - bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header); + // Phase 3a: write everything *except* arena-internal pointer fields. + // `sm_dev_base` is the device address of the SM (only stored, never + // dereferenced here). Safe to call on a host arena that holds the + // prebuilt image buffer. (The orchestrator counterpart takes + // task_window_size for ring task_descriptors address arithmetic; the + // scheduler only needs the SM header / ring header base addresses, + // both window-size-independent.) 
+ bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base); + + // Phase 3b: write the arena-internal pointer fields + // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each + // ring, wiring.queue.buffer_). Called on both host and device sides. + void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena); // Forget per-region pointers; arena owns the backing memory. void destroy(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp new file mode 100644 index 000000000..d66acfcc4 --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Host/AICPU shared runtime-arena layout, init_data and wire implementations. + * + * Lives under runtime/shared/ so it is included in both the host_runtime.so + * build (host pre-populates the prebuilt arena image) and the aicpu_runtime + * build (AICPU runs wire_arena_pointers + destroy after attach). 
The + * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp + * (ops table, scope/submit/dispatch business logic, profiling) stay in their + * original files and the aicpu build only. + */ + +#include +#include + +#include "pto_orchestrator.h" +#include "pto_runtime2.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Ready queue +// ============================================================================= + +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { + // Align the slots[] base to a full cache line so MPMC CAS traffic on the + // first slot cannot false-share with whatever region sits in front of us + // (e.g. orchestrator tensormap heads written by the orch thread). + return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); +} + +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { + // Address the slots region for data writes without storing the pointer in + // queue->slots — that field is set by ready_queue_wire_arena_pointers. + auto *slots_arena = static_cast(arena.region_ptr(slots_off)); + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(0, std::memory_order_relaxed); + queue->dequeue_pos.store(0, std::memory_order_relaxed); + + for (uint64_t i = 0; i < capacity; i++) { + slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed); + slots_arena[i].slot_state = nullptr; + } + + return true; +} + +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) { + queue->slots = static_cast(arena.region_ptr(slots_off)); +} + +void ready_queue_destroy(PTO2ReadyQueue *queue) { + // Arena owns the slots[] buffer; just forget the pointer. 
+ queue->slots = nullptr; +} + +// ============================================================================= +// Scheduler +// ============================================================================= + +bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) { + // ring stores the device address of the SM ring header — pure offset + // arithmetic, no SM load. + ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); + last_task_alive = 0; + advance_lock.store(0, std::memory_order_relaxed); + + // Per-slot SM-side initialization (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle:: + // init_header_per_ring so the AICPU performs it during SM reset; host + // prebuilt-arena init skips SM access here. + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } + +PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { + PTO2SchedulerLayout layout{}; + layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; + layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; + layout.dep_pool_capacity = dep_pool_capacity; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + } + layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Force a cache-line base so writes from scheduler thread 0 (sole + // writer of this ring's dep_pool) do not invalidate adjacent + // multi-threaded regions like ready_queue.slots. 
+ layout.off_dep_pool_entries[r] = + arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); + } + layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); + return layout; +} + +bool PTO2SchedulerState::init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base +) { + PTO2SchedulerState *sched = this; + sched->sm_header = reinterpret_cast(sm_dev_base); +#if PTO2_SCHED_PROFILING + sched->tasks_completed.store(0, std::memory_order_relaxed); + sched->tasks_consumed.store(0, std::memory_order_relaxed); +#endif + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) { + return false; + } + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + if (!ready_queue_init_data_from_layout( + &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity + )) { + return false; + } + } + if (!ready_queue_init_data_from_layout( + &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity + )) { + return false; + } + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); + sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err); + } + + if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { + return false; + } + sched->wiring.batch_count = 0; + sched->wiring.batch_index = 0; + sched->wiring.backoff_counter = 0; + + return true; +} + +void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) { + PTO2SchedulerState *sched = this; + for (int i = 0; i < 
PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]); + } + ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].dep_pool.base = + static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + } + sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer); +} + +void PTO2SchedulerState::destroy() { + PTO2SchedulerState *sched = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + sched->ring_sched_states[r].dep_pool.base = nullptr; + } + sched->wiring.queue.destroy(); + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_destroy(&sched->ready_queues[i]); + } + ready_queue_destroy(&sched->dummy_ready_queue); +} + +// ============================================================================= +// Orchestrator +// ============================================================================= + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity +) { + PTO2OrchestratorLayout layout{}; + layout.dep_pool_capacity = dep_pool_capacity; + layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; + layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); + } + layout.off_scope_tasks = arena.reserve( + static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) + ); + layout.off_scope_begins = + arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); + 
layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); + return layout; +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size +) { + auto *orch = this; + *orch = PTO2OrchestratorState{}; + + orch->sm_header = reinterpret_cast(sm_dev_base); + orch->gm_heap_base = gm_heap; + orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + orch->fatal = false; + + // Mirror the SM API's per-ring window-size shape so a future per-ring + // SM layout cannot silently disagree with the addresses we compute here. + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) + task_window_sizes[r] = task_window_size; + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r); + auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); + auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); + + orch->rings[r].task_allocator.init( + task_descs_dev, static_cast(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, + heap_size, orch_err + ); + + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + memset(fanin_entries, 0, fanin_pool_bytes); + orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err); + } + + if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) { + return false; + } + + orch->scope_tasks_size = 0; + orch->scope_tasks_capacity = layout.scope_tasks_cap; + 
orch->scope_stack_top = -1; + orch->scope_stack_capacity = layout.scope_stack_capacity; + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + + return true; +} + +void PTO2OrchestratorState::wire_arena_pointers( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg +) { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + } + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena); + orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); + orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); + orch->scheduler = scheduler_arg; +} + +void PTO2OrchestratorState::destroy() { + auto *orch = this; + orch->tensor_map.destroy(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = nullptr; + } + orch->scope_tasks = nullptr; + orch->scope_begins = nullptr; +} + +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } + +// ============================================================================= +// Top-level runtime arena +// ============================================================================= + +PTO2RuntimeArenaLayout +runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) { + PTO2RuntimeArenaLayout layout{}; + layout.task_window_size = task_window_size; + layout.dep_pool_capacity = dep_pool_capacity; + + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = static_cast(task_window_size); + } + + layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); + layout.sched = PTO2SchedulerState::reserve_layout(arena, 
dep_pool_capacity); + layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); + layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); + + layout.arena_size = arena.total_size(); + return layout; +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size +) { + PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime)); + memset(rt, 0, sizeof(*rt)); + + auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle)); + memset(sm_wrap, 0, sizeof(*sm_wrap)); + + // rt->ops is filled by the AICPU at boot. + rt->mode = mode; + rt->gm_heap = gm_heap_dev_base; + rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; + rt->gm_heap_owned = false; + rt->total_cycles = 0; + + if (!rt->orchestrator.init_data_from_layout( + layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size + )) { + return nullptr; + } + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) { + return nullptr; + } + + auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + memset(mailbox, 0, sizeof(*mailbox)); + + return rt; +} + +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) { + rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle)); + rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler); + rt->scheduler.wire_arena_pointers(layout.sched, arena); +} + +void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) { + // Arena buffer is pooled across runs by DeviceRunner — never freed here. 
+ if (!rt) return; + rt->scheduler.destroy(); + rt->orchestrator.destroy(); + rt->aicore_mailbox = nullptr; + rt->sm_handle = nullptr; +} diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp index 358c87f57..1e1edff92 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp @@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring( header->sched_error_bitmap.store(0, std::memory_order_relaxed); header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. 
+ for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } } // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp index a0b98bd09..b99c67233 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp @@ -81,43 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); } -bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { num_buckets = layout.num_buckets; pool_size = layout.pool_size; - buckets = static_cast(arena.region_ptr(layout.off_buckets)); - entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); - free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). + auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); // buckets[]: empty == nullptr. 
for (int32_t i = 0; i < num_buckets; i++) { - buckets[i] = nullptr; + buckets_arena[i] = nullptr; } // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...). // The pool's persistent invariant after init is "bucket_index == -1 means // not linked", set explicitly below. - memset(entry_pool, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); for (int32_t i = 0; i < pool_size; i++) { - entry_pool[i].bucket_index = -1; - entry_pool[i].next_in_bucket = nullptr; - entry_pool[i].prev_in_bucket = nullptr; - entry_pool[i].next_in_task = nullptr; - entry_pool[i].prev_in_task = nullptr; - entry_pool[i].producer_task_id = PTO2TaskId{}; + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; } // free_entry_list: zeroed (was calloc'd before); contents become meaningful // only after entries are freed back, so the body of the array stays as 0. 
- memset(free_entry_list, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); next_entry_idx = 0; free_num = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { - task_entry_heads[r][i] = nullptr; + heads_arena[i] = nullptr; } task_window_sizes[r] = layout.task_window_sizes[r]; last_task_alives[r] = 0; @@ -127,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr return true; } +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } +} + void PTO2TensorMap::destroy() { // Arena owns the backing memory; here we only forget our pointers so any // stray post-destroy access trips a nullptr dereference instead of reading diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 6a7ab65da..b3347b53c 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -44,6 +44,8 @@ Runtime::Runtime() { gm_heap_ptr_ = nullptr; slot_states_ptr_ = nullptr; orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; @@ -74,6 +76,13 @@ void 
Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} +void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; } + // Device orchestration SO metadata (bytes live in a separate device buffer // owned by DeviceRunner; only the address/size travels in Runtime). void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 38242555d..506613dcd 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -195,40 +195,75 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); } DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { - if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). 
If a caller asks for a larger layout on any + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { + return 0; + } + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false, so the next entry's + // is_committed() guard skips the release branch. release() is + // idempotent on a never-committed arena (zeroes cursor_). + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; + return -1; } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). 
- static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); + cached_gm_heap_size_ = 0; + cached_gm_sm_size_ = 0; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); +} + +void *DeviceRunner::acquire_pooled_runtime_arena() { + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — return nullptr if a caller asks for it anyway. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1039,14 +1074,25 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); } - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it. 
+ gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; + + // Free the 8-byte device_wall buffer (allocated lazily in run()) while + // mem_alloc_ and the device context are still live. free_tensor() routes + // through mem_alloc_.free(), so it must run before finalize() and before + // rtDeviceReset() tears down the device runtime. + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); @@ -1057,11 +1103,6 @@ int DeviceRunner::finalize() { return rc; } - // Free the 8-byte device_wall buffer (allocated lazily in run()). - if (device_wall_dev_ptr_ != nullptr) { - free_tensor(device_wall_dev_ptr_); - device_wall_dev_ptr_ = nullptr; - } device_id_ = -1; block_dim_ = 0; worker_count_ = 0; diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index a07ab28bb..0d8cc0397 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -174,24 +174,36 @@ struct KernelArgsHelper { class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. 
+ * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no + * prebuilt runtime arena) — the corresponding arena stays uncommitted. + * Returns 0 on success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have already committed the relevant region; + * otherwise these return nullptr. All pointers are stable for the + * Worker's lifetime; the three underlying device buffers are released + * in `finalize()`. + * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -511,22 +523,30 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). 
Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block — three separate device_malloc calls are friendlier than one + // big one. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. 
size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Device resources rtStream_t stream_aicpu_{nullptr}; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 0cc17c81f..1a2bb32a9 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -426,6 +434,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; // Restore kernel addrs + orch symbol names + active_callable_id; the diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index c0d26fbe1..b3072919c 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -112,40 +112,66 @@ static int prof_free_cb(void *dev_ptr) { DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { - if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). 
- static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - return -1; - } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { + return 0; + } + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false, so the next entry's + // is_committed() guard skips the release branch. release() is + // idempotent on a never-committed arena (zeroes cursor_). + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + // Failure of a later region leaves earlier peers committed on purpose: + // pooled pointers previously returned to callers must stay valid even if + // this resize attempt aborts. 
+ if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); +} + +void *DeviceRunner::acquire_pooled_runtime_arena() { + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -929,14 +955,16 @@ int DeviceRunner::finalize() { // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it. 
+ gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations mem_alloc_.finalize(); diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 0aa6e6fa1..59b685572 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -72,24 +72,33 @@ class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path + * (leaves that arena uncommitted). Returns 0 on success, -1 on + * failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. 
+ * setup_static_arena must have already committed the relevant region; + * otherwise these return nullptr. + * + * acquire_pooled_runtime_arena() is trb-only — the region is only + * committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path returns nullptr. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -280,22 +289,30 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block — three separate device_malloc calls are friendlier than one + // big one. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free to mem_alloc_. static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. 
+ DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Simulation state (no actual device resources) KernelArgs kernel_args_; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 81e9b138f..f2dc10b4e 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -328,6 +336,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id); diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index b9edf7020..25c6c13f4 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -146,9 +146,10 @@ struct HostApi { // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of // which runtime variant it is built against. Unset for this variant; do // not call. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). 
DeviceRunner walks child_offsets_ to compute diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index bcea9b09e..49d55380f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -125,8 +125,10 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox - // sub-regions (created in runtime_create_from_sm, released in runtime_destroy). + // Per-Worker arena attaching to the pooled prebuilt runtime image. Host + // populates the layout + data on its own arena, rtMemcpys into a pooled + // device buffer owned by DeviceRunner, and the AICPU attach()es to that + // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc. // Default-constructed: libc-backed backend, no ctx. DeviceArena runtime_arena_; @@ -466,29 +468,61 @@ int32_t AicpuExecutor::run(Runtime *runtime) { static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity ); - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt + // runtime arena image at host build time, so we no longer fetch + // them here. They remain on the host Runtime instance and on the + // PTO2Runtime header for diagnostic purposes only. + (void)dep_pool_capacity; + void *sm_ptr = runtime->get_gm_sm_ptr(); uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - rt = runtime_create_from_sm( - PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_, - dep_pool_capacity - ); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. + + // Prebuilt-arena fast path. 
Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. + void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). + runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) { + LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx); runtime_init_ready_.store(true, std::memory_order_release); return -1; } + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). 
runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); + #if PTO2_PROFILING rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. runtime->set_slot_states_ptr(nullptr); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 0c7ac3872..037d3ab04 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -36,8 +36,10 @@ #include #include "../common/pto_runtime_status.h" +#include "../runtime/pto_runtime2.h" #include "../runtime/pto_shared_memory.h" #include "../runtime/runtime.h" +#include "device_arena.h" #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" @@ -271,15 +273,35 @@ extern "C" int bind_prepared_to_runtime_impl( uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE; uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE; - // Lay out the per-Worker static device arena. GM heap (orchestrator output - // buffers, all rings combined) and PTO2 shared memory live in a single - // backing allocation; setup_static_arena reserves both regions and - // commits in one shot. Owned by DeviceRunner across runs — do NOT record - // in tensor_pairs_; the free is deferred to DeviceRunner::finalize(). + // Lay out the per-Worker pooled device arenas. GM heap, PTO2 shared memory, + // and the prebuilt runtime arena are three independent backing allocations; + // setup_static_arena reserves and commits each region on its own. 
+ // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the + // free is deferred to DeviceRunner::finalize(). The runtime-arena size is + // determined by replaying the reserve sequence on a host-side arena. uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); + // dep_pool_size comes from a uint64 env var; reject values that don't fit + // the int32_t layout-sizing path rather than silently truncating. + int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size != 0) { + if (runtime->dep_pool_size > static_cast(INT32_MAX)) { + LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size); + return -1; + } + eff_dep_pool_capacity = static_cast(runtime->dep_pool_size); + } + + int64_t t_prebuilt_start = _now_ms(); + DeviceArena host_arena; // libc malloc backend by default + PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity); + if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); + return -1; + } + int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) { + if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return -1; } @@ -303,9 +325,48 @@ extern "C" int bind_prepared_to_runtime_impl( } runtime->set_gm_sm_ptr(sm_ptr); + void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena(); + if (runtime_arena_dev == nullptr) { + LOG_ERROR("Failed to acquire pooled runtime arena"); + return -1; + } + // Set up device orchestration state runtime->set_orch_args(device_args); + // ------------------------------------------------------------------------- + // Build the prebuilt runtime-arena image on host. 
+ // + // We pre-compute every byte the AICPU's runtime arena would otherwise have + // to write at boot: layout offsets, sub-structure init data, and pointers + // back to the SM / GM heap. Then we rtMemcpy the image into the pooled + // runtime-arena region that DeviceRunner keeps alive across runs. AICPU + // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM + // reset) + a handful of device-only field fixups. + // ------------------------------------------------------------------------- + PTO2Runtime *rt = + runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size); + if (rt == nullptr) { + LOG_ERROR("runtime_init_data_from_layout failed"); + return -1; + } + runtime_wire_arena_pointers(host_arena, layout, rt); + + // Stash the layout inside the PTO2Runtime image so the AICPU can recover + // every arena-internal offset after rtMemcpy. The runtime arena's device + // base does NOT travel in this image — it's on the host Runtime + // (set_prebuilt_arena below), since the AICPU needs that pointer + // *before* it can dereference the image. 
+ rt->prebuilt_layout = layout; + + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime); + int64_t t_prebuilt_end = _now_ms(); + LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count); int64_t t_total_end = _now_ms(); @@ -313,6 +374,7 @@ extern "C" int bind_prepared_to_runtime_impl( LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start); LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start); + LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); return 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 05ac105a8..c937fd986 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -329,11 +329,22 @@ static bool prepare_task( prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); + // Re-bind payload/task pointers each submit. Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init_data_from_layout() skip the + // O(window_size) bind loop. Both writes hit the same 64B slot_state + // cache line we're about to dirty below, so the extra cost is two + // stores on an already-hot line. 
Must precede the scheduler + // wiring.queue.push at the end of submit_task_common — that push is + // the first read of slot_state->task / slot_state->payload by another + // thread. + out->slot_state->bind_buffers(out->payload, out->task); + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): // fanout_lock=0, fanout_count=1, fanout_head=nullptr, // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 - // Fields immutable after RingSchedState::init(): - // payload, task, ring_id + // Fields immutable after RingSchedState::init_data_from_layout(): + // ring_id // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor // observers); set to PENDING here when orchestrator actually reuses the slot. out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); @@ -348,88 +359,6 @@ static bool prepare_task( return true; } -// ============================================================================= -// Orchestrator Initialization -// ============================================================================= - -PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( - DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity -) { - PTO2OrchestratorLayout layout{}; - layout.dep_pool_capacity = dep_pool_capacity; - layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; - layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); - } - layout.off_scope_tasks = arena.reserve( - static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) - ); - layout.off_scope_begins = - arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), 
alignof(int32_t)); - layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); - return layout; -} - -bool PTO2OrchestratorState::init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap, - uint64_t heap_size -) { - auto *orch = this; - *orch = PTO2OrchestratorState{}; - - orch->sm_header = sm_header_arg; - orch->gm_heap_base = gm_heap; - orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - orch->fatal = false; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &ring = sm_header_arg->rings[r]; - - orch->rings[r].task_allocator.init( - ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive, - ring_heap_base, heap_size, &sm_header_arg->orch_error_code - ); - - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); - memset(fanin_entries, 0, fanin_pool_bytes); - orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code); - } - - if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) { - return false; - } - orch->tensor_map.orch = orch; - - orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); - orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); - orch->scope_tasks_size = 0; - orch->scope_tasks_capacity = layout.scope_tasks_cap; - orch->scope_stack_top = -1; - orch->scope_stack_capacity = layout.scope_stack_capacity; - orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; - - return true; -} - -void PTO2OrchestratorState::destroy() { - auto *orch = this; - orch->tensor_map.destroy(); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - orch->rings[r].fanin_pool.base = nullptr; - } - 
orch->scope_tasks = nullptr; - orch->scope_begins = nullptr; -} - -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } - // ============================================================================= // Scope Management // ============================================================================= @@ -578,9 +507,6 @@ static TaskOutputTensors submit_task_common( auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool { PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local()); - if (prod_state->task == nullptr || prod_state->task->task_id != producer_task_id) { - return true; // producer slot reused for a different task — dep is moot - } return append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id); }; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index e24b85b4e..9a73714c0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -133,19 +133,29 @@ struct PTO2OrchestratorState { // === Cold-path API (defined in pto_orchestrator.cpp) === // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays, - // tensor_map sub-layout) on the supplied arena. + // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds + // the nested tensor_map layout. Returned layout is consumed by + // init_data_from_layout. static PTO2OrchestratorLayout reserve_layout( DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); - // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool - // and tensor_map. Arena must be committed. 
- bool init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap, - uint64_t heap_size + // Phase 3a: write everything *except* arena-internal pointer fields. + // sm_dev_base is the SM device address (only stored, never dereferenced); + // task_window_size feeds the per-ring SM address arithmetic. Safe to call + // on a host arena that holds the prebuilt image. + bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size ); + // Phase 3b: write the arena-internal pointer fields (scope_tasks, + // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, + // free_entry_list,task_entry_heads}, scheduler reference). + // Idempotent — host runs once on the image, AICPU runs once after attach. + void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); + // Forget pointers; arena owns the backing buffers. void destroy(); void set_scheduler(PTO2SchedulerState *scheduler); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..abd2a7510 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -68,10 +68,22 @@ class PTO2TaskAllocator { public: /** * Initialize the allocator with task ring and heap ring resources. + * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. 
+ * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. */ void init( PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, - std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 ) { descriptors_ = descriptors; window_size_ = window_size; @@ -81,7 +93,7 @@ class PTO2TaskAllocator { heap_base_ = heap_base; heap_size_ = heap_size; error_code_ptr_ = error_code_ptr; - local_task_id_ = current_index_ptr->load(std::memory_order_relaxed); + local_task_id_ = initial_local_task_id; heap_top_ = 0; heap_tail_ = 0; last_alive_seen_ = 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..f39bac365 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = { }; // ============================================================================= -// Runtime Creation and Destruction +// Runtime Lifecycle (AICPU-only fixup) // ============================================================================= - -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity -) { - if (!sm_base || sm_size == 0) return 
nullptr; - - // Phase 1: layout. Reserve every sub-region the runtime needs (including - // the SM handle wrapper itself) without touching memory yet. - int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = static_cast(task_window_size); - } - const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); - PTO2OrchestratorLayout orch_layout = - PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); - PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity); - const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); - const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); - - // Phase 2: single backing allocation. - if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr; - - // Phase 3: bind region pointers and initialize. - PTO2Runtime *rt = static_cast(arena.region_ptr(off_runtime)); - memset(rt, 0, sizeof(*rt)); // calloc-equivalent for the runtime header. - - // Initialize the SM handle wrapper in-place on its arena region before - // anything that reads sm_handle->header (orchestrator / scheduler init). - rt->sm_handle = static_cast(arena.region_ptr(off_sm_handle)); - memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); - if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) { - arena.release(); - return nullptr; - } - +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image. The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. 
+ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) { - arena.release(); - return nullptr; - } - if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) { - rt->orchestrator.destroy(); - arena.release(); - return nullptr; - } - rt->orchestrator.set_scheduler(&rt->scheduler); - - rt->aicore_mailbox = static_cast(arena.region_ptr(off_mailbox)); - memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); - - return rt; -} - -void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) { - if (!rt) { - arena.release(); // safe: idempotent if nothing's committed. - return; - } - - rt->scheduler.destroy(); - rt->orchestrator.destroy(); - rt->aicore_mailbox = nullptr; // arena-owned. - rt->sm_handle = nullptr; // wrapper lives in arena; release() reclaims it. - - // arena.release() frees the single backing buffer that holds rt, - // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot. - arena.release(); + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; } void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 1da622407..460624e69 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -91,6 +91,30 @@ struct PTO2RuntimeOps { TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); }; +/** + * Layout descriptor for the prebuilt runtime arena. 
Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_size{0}; + uint64_t heap_size{0}; + int32_t dep_pool_capacity{0}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + /** * PTO Runtime2 context * @@ -118,6 +142,16 @@ struct PTO2Runtime { // Statistics int64_t total_cycles; + + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. Populated on host by + // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by + // aicpu_executor.cpp. + PTO2RuntimeArenaLayout prebuilt_layout; }; // ============================================================================= @@ -125,31 +159,60 @@ struct PTO2Runtime { // ============================================================================= /** - * Create runtime from caller-provided GM SM buffer + GM heap. + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. 
+ * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. + */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. * - * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime, - * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map - * sub-regions) is laid out on the supplied arena and committed in a single - * backing allocation. runtime_destroy() calls arena.release() once to free - * the lot. + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. * - * @param mode Execution mode - * @param sm_base Pre-allocated SM buffer base (host-owned) - * @param sm_size Size of the SM buffer in bytes - * @param task_window_size Per-ring task window size used to lay out SM - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @param arena Caller-owned arena that sources all runtime sub-regions. - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. 
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. + */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size ); /** - * Destroy runtime and free all resources. arena.release() is the actual - * memory free; the rt pointer is no longer valid afterward. + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. + */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). 
*/ void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 999dbf6c5..a0dfbd9ef 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -92,7 +92,7 @@ // Task management // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. -// Actual window size is passed at runtime to runtime_create_from_sm(). +// Actual window size is passed at runtime to runtime_reserve_layout(). // Use pto2_task_slot(sched, task_id) for slot calculation. #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) @@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState { // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) std::atomic fanout_refcount; // Dynamic: counts released references - // --- Immutable after RingSchedState::init() (same value on every slot reuse) --- + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. PTO2TaskPayload *payload; PTO2TaskDescriptor *task; @@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState { int16_t next_block_idx{0}; // Next block to dispatch (scheduler state) /** - * One-time binding of slot-invariant fields. - * Called during RingSchedState::init() — these values are determined by - * the slot's position in the ring and never change across reuses. + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. 
*/ - void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) { + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { payload = p; task = t; - ring_id = rid; } /** diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index cf8dbb780..98b832510 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -53,11 +53,18 @@ struct PTO2SharedMemoryHandle; */ struct alignas(64) PTO2RingFlowControl { // === Cache Line 0: Written by Orchestrator, Read by Scheduler === - std::atomic current_task_index; // Task ring head (next to allocate) + alignas(64) std::atomic current_task_index; // Task ring head (next to allocate) // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. 
void init() { current_task_index.store(0, std::memory_order_relaxed); last_task_alive.store(0, std::memory_order_relaxed); @@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle { void setup_pointers(uint64_t task_window_size); void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); }; + +// ============================================================================= +// SM Device Layout Helpers +// ============================================================================= +// +// When the host pre-builds a runtime-arena image, it needs the device-side +// addresses of several SM sub-fields (ring flow-control counters, +// task_descriptors arrays, orch_error_code) so it can wire them into the +// orchestrator / scheduler init_data path without dereferencing the SM — +// the SM lives in device memory and cannot be touched from host. +// +// These helpers compute those addresses by offset arithmetic on the SM +// device base. Pure pointer math, no loads/stores; safe to call from host. +// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's +// own setup_pointers), so values are guaranteed consistent across sides. 
+namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. 
+inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 39d6e4ad2..b63f20676 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -47,12 +47,12 @@ #include "pto_runtime2_types.h" #include "tensor.h" -struct PTO2OrchestratorState; // forward declare - /** * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the * region offsets returned by DeviceArena::reserve() so init_from_layout() * can fetch the matching pointers after the arena is committed. + * + * All offsets are relative to the arena's base. 
*/ struct PTO2TensorMapLayout { size_t off_buckets; @@ -367,8 +367,6 @@ struct PTO2TensorMap { // Per-ring cleanup progress (for periodic cleanup_retired) int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; - PTO2OrchestratorState *orch{nullptr}; - uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { return task_local_id & (task_window_sizes[ring_id] - 1); } @@ -433,11 +431,19 @@ struct PTO2TensorMap { reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); /** - * Phase 3: bind region pointers and initialize state. The arena must already - * be committed; layout must have been produced by reserve_layout() against - * the same arena. + * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. + */ + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. */ - bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); /** * Tear down state. 
Does not free memory — the arena owns the backing diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index a4aef9c04..4a690e8ca 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -48,7 +48,7 @@ #define RUNTIME_MAX_ARGS 128 #define RUNTIME_MAX_WORKER 108 // 36 AIC + 72 AIV cores #define RUNTIME_MAX_FUNC_ID 1024 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 1MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO #define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 // Default ready queue shards: one shard per worker thread (total minus orchestrator) @@ -127,18 +127,25 @@ struct HostApi { void (*device_free)(void *dev_ptr); int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - // Lay out and commit the per-Worker static device arena that backs both - // the PTO2 GM heap and the PTO2 shared memory in a single underlying - // allocation. Must be called once before acquire_pooled_gm_heap / - // acquire_pooled_gm_sm. Returns 0 on success, -1 on allocation failure. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + // memory, trb prebuilt runtime arena) as three independent device + // allocations. `runtime_arena_size == 0` skips the third region (hbg + // path: hbg has no prebuilt runtime arena). Idempotent on identical + // sizes; returns 0 on success, -1 on allocation failure. + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory. 
The static arena must already be committed via - // setup_static_arena; the returned pointer is owned by the DeviceRunner - // and freed in `DeviceRunner::finalize()` — do NOT pass it to - // device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. setup_static_arena must have already + // committed the relevant region; the returned pointer is owned by the + // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it + // to device_free or record it in `tensor_pairs_`. + // + // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is + // only committed when setup_static_arena was invoked with + // runtime_arena_size > 0. Calling it on the hbg path + // (setup_static_arena(...,0)) returns nullptr (not undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute @@ -218,6 +225,13 @@ class Runtime { void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + // Device orchestration SO (for dlopen on AICPU thread 3). // The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. 
@@ -254,6 +268,16 @@ class Runtime { void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); + // Prebuilt-arena fast path (trb only). Set by host's + // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. + void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + // Device orchestration SO binary (for dlopen on AICPU thread 3) void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index 281a714fb..2d777e9b0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -61,152 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { } #endif -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { - // Align the slots[] base to a full cache line so MPMC CAS traffic on the - // first slot cannot false-share with whatever region sits in front of us - // (e.g. orchestrator tensormap heads written by the orch thread). 
- return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); -} - -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { - queue->slots = static_cast(arena.region_ptr(slots_off)); - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for (uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void ready_queue_destroy(PTO2ReadyQueue *queue) { - // Arena owns the slots[] buffer; just forget the pointer. - queue->slots = nullptr; -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - ring = &sm_header->rings[ring_id]; - last_task_alive = 0; - advance_lock.store(0, std::memory_order_relaxed); - - // Initialize all per-task slot state fields. - // bind() sets payload, task, ring_id — immutable after init, bound once - // to their fixed shared-memory addresses. - // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, - // rest zero) so the first submit needs no reset. 
- for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); - ring->slot_states[i].reset_for_reuse(); - ring->slot_states[i].fanin_count = 0; - ring->slot_states[i].active_mask = ActiveMask{}; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } - -PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { - PTO2SchedulerLayout layout{}; - layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; - layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; - layout.dep_pool_capacity = dep_pool_capacity; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - } - layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - // Force a cache-line base so writes from scheduler thread 0 (sole - // writer of this ring's dep_pool) do not invalidate adjacent - // multi-threaded regions like ready_queue.slots. - layout.off_dep_pool_entries[r] = - arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - } - layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); - return layout; -} - -bool PTO2SchedulerState::init_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg -) { - PTO2SchedulerState *sched = this; - sched->sm_header = sm_header_arg; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Per-ring scheduler state — no arena buffers, just field init. 
- for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_header_arg, r)) { - return false; - } - } - - // Ready queues — one per resource shape plus DUMMY. - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!ready_queue_init_from_layout( - &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity - )) { - return false; - } - } - if (!ready_queue_init_from_layout( - &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity - )) { - return false; - } - - // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated - // base + capacity, so we just plumb the arena region into it. - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); - // calloc-equivalent: pool expects entries zeroed at construction. - memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); - sched->ring_sched_states[r].dep_pool.init( - dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code - ); - } - - // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop). 
- if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { - return false; - } - sched->wiring.batch_count = 0; - sched->wiring.batch_index = 0; - sched->wiring.backoff_counter = 0; - - return true; -} - -void PTO2SchedulerState::destroy() { - PTO2SchedulerState *sched = this; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - sched->ring_sched_states[r].dep_pool.base = nullptr; - } - - sched->wiring.queue.destroy(); - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - ready_queue_destroy(&sched->ready_queues[i]); - } - ready_queue_destroy(&sched->dummy_ready_queue); -} - // ============================================================================= // Debug Utilities // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 32887d0be..173f65135 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue { // initialize sequence counters // destroy: forget the slots pointer (arena owns the buffer) size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity); -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Writes everything *except* the arena-internal `slots` pointer field +// (sequences/positions on the slot array, capacity, mask). Uses +// arena.region_ptr(slots_off) only to address the slot array for writes; +// does NOT store the pointer in `queue->slots`. Call +// `ready_queue_wire_arena_pointers` afterwards to set the field itself. 
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Stores queue->slots = arena.region_ptr(slots_off). Idempotent. +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); void ready_queue_destroy(PTO2ReadyQueue *queue); // ============================================================================= @@ -449,13 +456,17 @@ struct alignas(64) PTO2SpscQueue { return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE); } - // Bind buffer pointer + reset indices. The capacity must be a power of two - // and match the value passed to reserve_layout. - bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + // Writes everything except the arena-internal `buffer_` pointer field + // (zeros the slot pointer array, mask/head/tail). The host pre-builds the + // image without storing a host address in buffer_; the AICPU wires + // buffer_ at boot via wire_arena_pointers(). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; - buffer_ = static_cast(arena.region_ptr(buffer_off)); + auto *buf = static_cast(arena.region_ptr(buffer_off)); + // calloc'd-equivalent: zero the slot pointers so spurious early pops + // observe nullptr. for (uint64_t i = 0; i < capacity; i++) - buffer_[i] = nullptr; + buf[i] = nullptr; mask_ = capacity - 1; head_.store(0, std::memory_order_relaxed); tail_.store(0, std::memory_order_relaxed); @@ -464,6 +475,12 @@ struct alignas(64) PTO2SpscQueue { return true; } + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + // Arena owns the buffer; here we only forget our pointer. 
void destroy() { buffer_ = nullptr; } @@ -561,7 +578,12 @@ struct PTO2SchedulerState { // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); + // Initialize arena-internal data + arena-external pointers; does NOT + // store dep_pool.base (that lives in the runtime arena and is wired + // by SchedulerState::wire_arena_pointers). The `ring` field stores + // the device address of the SM ring header — computed via offset + // arithmetic, no SM dereference. + bool init_data_from_layout(void *sm_dev_base, int32_t ring_id); void destroy(); void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } @@ -1040,10 +1062,23 @@ struct PTO2SchedulerState { // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots, // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena. + // Capacities are baked into the returned layout; init_data_from_layout uses + // the same values. static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); - // Phase 3: bind region pointers and initialize state. - bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header); + // Phase 3a: write everything *except* arena-internal pointer fields. + // `sm_dev_base` is the device address of the SM (only stored, never + // dereferenced here). Safe to call on a host arena that holds the + // prebuilt image buffer. (The orchestrator counterpart takes + // task_window_size for ring task_descriptors address arithmetic; the + // scheduler only needs the SM header / ring header base addresses, + // both window-size-independent.) 
+ bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base); + + // Phase 3b: write the arena-internal pointer fields + // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each + // ring, wiring.queue.buffer_). Called on both host and device sides. + void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena); // Forget per-region pointers; arena owns the backing memory. void destroy(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp new file mode 100644 index 000000000..d66acfcc4 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Host/AICPU shared runtime-arena layout, init_data and wire implementations. + * + * Lives under runtime/shared/ so it is included in both the host_runtime.so + * build (host pre-populates the prebuilt arena image) and the aicpu_runtime + * build (AICPU runs wire_arena_pointers + destroy after attach). 
The + * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp + * (ops table, scope/submit/dispatch business logic, profiling) stay in their + * original files and the aicpu build only. + */ + +#include +#include + +#include "pto_orchestrator.h" +#include "pto_runtime2.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Ready queue +// ============================================================================= + +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { + // Align the slots[] base to a full cache line so MPMC CAS traffic on the + // first slot cannot false-share with whatever region sits in front of us + // (e.g. orchestrator tensormap heads written by the orch thread). + return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); +} + +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { + // Address the slots region for data writes without storing the pointer in + // queue->slots — that field is set by ready_queue_wire_arena_pointers. + auto *slots_arena = static_cast(arena.region_ptr(slots_off)); + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(0, std::memory_order_relaxed); + queue->dequeue_pos.store(0, std::memory_order_relaxed); + + for (uint64_t i = 0; i < capacity; i++) { + slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed); + slots_arena[i].slot_state = nullptr; + } + + return true; +} + +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) { + queue->slots = static_cast(arena.region_ptr(slots_off)); +} + +void ready_queue_destroy(PTO2ReadyQueue *queue) { + // Arena owns the slots[] buffer; just forget the pointer. 
+ queue->slots = nullptr; +} + +// ============================================================================= +// Scheduler +// ============================================================================= + +bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) { + // ring stores the device address of the SM ring header — pure offset + // arithmetic, no SM load. + ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); + last_task_alive = 0; + advance_lock.store(0, std::memory_order_relaxed); + + // Per-slot SM-side initialization (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle:: + // init_header_per_ring so the AICPU performs it during SM reset; host + // prebuilt-arena init skips SM access here. + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } + +PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { + PTO2SchedulerLayout layout{}; + layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; + layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; + layout.dep_pool_capacity = dep_pool_capacity; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + } + layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Force a cache-line base so writes from scheduler thread 0 (sole + // writer of this ring's dep_pool) do not invalidate adjacent + // multi-threaded regions like ready_queue.slots. 
+ layout.off_dep_pool_entries[r] = + arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); + } + layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); + return layout; +} + +bool PTO2SchedulerState::init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base +) { + PTO2SchedulerState *sched = this; + sched->sm_header = reinterpret_cast(sm_dev_base); +#if PTO2_SCHED_PROFILING + sched->tasks_completed.store(0, std::memory_order_relaxed); + sched->tasks_consumed.store(0, std::memory_order_relaxed); +#endif + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) { + return false; + } + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + if (!ready_queue_init_data_from_layout( + &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity + )) { + return false; + } + } + if (!ready_queue_init_data_from_layout( + &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity + )) { + return false; + } + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); + sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err); + } + + if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { + return false; + } + sched->wiring.batch_count = 0; + sched->wiring.batch_index = 0; + sched->wiring.backoff_counter = 0; + + return true; +} + +void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) { + PTO2SchedulerState *sched = this; + for (int i = 0; i < 
PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]); + } + ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].dep_pool.base = + static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + } + sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer); +} + +void PTO2SchedulerState::destroy() { + PTO2SchedulerState *sched = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + sched->ring_sched_states[r].dep_pool.base = nullptr; + } + sched->wiring.queue.destroy(); + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_destroy(&sched->ready_queues[i]); + } + ready_queue_destroy(&sched->dummy_ready_queue); +} + +// ============================================================================= +// Orchestrator +// ============================================================================= + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity +) { + PTO2OrchestratorLayout layout{}; + layout.dep_pool_capacity = dep_pool_capacity; + layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; + layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); + } + layout.off_scope_tasks = arena.reserve( + static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) + ); + layout.off_scope_begins = + arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); + 
layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); + return layout; +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size +) { + auto *orch = this; + *orch = PTO2OrchestratorState{}; + + orch->sm_header = reinterpret_cast(sm_dev_base); + orch->gm_heap_base = gm_heap; + orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + orch->fatal = false; + + // Mirror the SM API's per-ring window-size shape so a future per-ring + // SM layout cannot silently disagree with the addresses we compute here. + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) + task_window_sizes[r] = task_window_size; + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r); + auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); + auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); + + orch->rings[r].task_allocator.init( + task_descs_dev, static_cast(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, + heap_size, orch_err + ); + + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + memset(fanin_entries, 0, fanin_pool_bytes); + orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err); + } + + if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) { + return false; + } + + orch->scope_tasks_size = 0; + orch->scope_tasks_capacity = layout.scope_tasks_cap; + 
orch->scope_stack_top = -1; + orch->scope_stack_capacity = layout.scope_stack_capacity; + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + + return true; +} + +void PTO2OrchestratorState::wire_arena_pointers( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg +) { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + } + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena); + orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); + orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); + orch->scheduler = scheduler_arg; +} + +void PTO2OrchestratorState::destroy() { + auto *orch = this; + orch->tensor_map.destroy(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = nullptr; + } + orch->scope_tasks = nullptr; + orch->scope_begins = nullptr; +} + +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } + +// ============================================================================= +// Top-level runtime arena +// ============================================================================= + +PTO2RuntimeArenaLayout +runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) { + PTO2RuntimeArenaLayout layout{}; + layout.task_window_size = task_window_size; + layout.dep_pool_capacity = dep_pool_capacity; + + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = static_cast(task_window_size); + } + + layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); + layout.sched = PTO2SchedulerState::reserve_layout(arena, 
dep_pool_capacity); + layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); + layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); + + layout.arena_size = arena.total_size(); + return layout; +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size +) { + PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime)); + memset(rt, 0, sizeof(*rt)); + + auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle)); + memset(sm_wrap, 0, sizeof(*sm_wrap)); + + // rt->ops is filled by the AICPU at boot. + rt->mode = mode; + rt->gm_heap = gm_heap_dev_base; + rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; + rt->gm_heap_owned = false; + rt->total_cycles = 0; + + if (!rt->orchestrator.init_data_from_layout( + layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size + )) { + return nullptr; + } + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) { + return nullptr; + } + + auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + memset(mailbox, 0, sizeof(*mailbox)); + + return rt; +} + +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) { + rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle)); + rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler); + rt->scheduler.wire_arena_pointers(layout.sched, arena); +} + +void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) { + // Arena buffer is pooled across runs by DeviceRunner — never freed here. 
+ if (!rt) return; + rt->scheduler.destroy(); + rt->orchestrator.destroy(); + rt->aicore_mailbox = nullptr; + rt->sm_handle = nullptr; +} diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp index 358c87f57..1e1edff92 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp @@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring( header->sched_error_bitmap.store(0, std::memory_order_relaxed); header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. 
+ for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } } // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp similarity index 82% rename from src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp rename to src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp index c09e6f4f6..b99c67233 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp @@ -30,7 +30,6 @@ #include "common.h" #include "common/unified_log.h" -#include "pto_orchestrator.h" // ============================================================================= // TensorMap Lookup Chain Length Statistics (compile-time toggle) @@ -82,37 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); } -bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { num_buckets = layout.num_buckets; pool_size = layout.pool_size; - buckets = static_cast(arena.region_ptr(layout.off_buckets)); - entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); - free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). 
+ auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // buckets[]: empty == nullptr. for (int32_t i = 0; i < num_buckets; i++) { - buckets[i] = nullptr; + buckets_arena[i] = nullptr; } - memset(entry_pool, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...). + // The pool's persistent invariant after init is "bucket_index == -1 means + // not linked", set explicitly below. + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); for (int32_t i = 0; i < pool_size; i++) { - entry_pool[i].bucket_index = -1; - entry_pool[i].next_in_bucket = nullptr; - entry_pool[i].prev_in_bucket = nullptr; - entry_pool[i].next_in_task = nullptr; - entry_pool[i].prev_in_task = nullptr; - entry_pool[i].producer_task_id = PTO2TaskId{}; + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; } - memset(free_entry_list, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + // free_entry_list: zeroed (was calloc'd before); contents become meaningful + // only after entries are freed back, so the body of the array stays as 0. 
+ memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); next_entry_idx = 0; free_num = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { - task_entry_heads[r][i] = nullptr; + heads_arena[i] = nullptr; } task_window_sizes[r] = layout.task_window_sizes[r]; last_task_alives[r] = 0; @@ -122,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr return true; } +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } +} + void PTO2TensorMap::destroy() { // Arena owns the backing memory; here we only forget our pointers so any // stray post-destroy access trips a nullptr dereference instead of reading diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 7daa54ed5..0ebb2ef79 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -46,6 +46,8 @@ Runtime::Runtime() { gm_heap_ptr_ = nullptr; slot_states_ptr_ = nullptr; orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; @@ -76,6 +78,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } void Runtime::set_slot_states_ptr(void *p) { 
slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} +void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; } + // Device orchestration SO metadata (bytes live in a separate device buffer // owned by DeviceRunner; only the address/size travels in Runtime). void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h index a0ade3dc3..ffe34c479 100644 --- a/src/common/device_comm/device_arena.h +++ b/src/common/device_comm/device_arena.h @@ -95,12 +95,39 @@ class DeviceArena { // the trampoline's free path must therefore be nothrow.) void *commit(size_t base_align = kDefaultBaseAlign); + // Phase 2 alternative: attach to an externally-owned buffer instead of + // allocating one. Caller guarantees: + // (a) `external_base` is already `base_align`-aligned — attach does + // NOT forward-align, since the prebuilt image was constructed for + // the address the caller advertised; + // (b) the buffer is at least `total_size()` bytes (the sum of sizes + // passed to reserve()), since attach uses no forward-alignment + // slack of its own; + // (c) all region offsets the caller plans to read back via + // `region_ptr(off)` are held by the caller — attach does NOT + // repopulate the internal region table, and reserve() cannot run + // after attach (it asserts !committed_). `region_size()` likewise + // returns 0 for attached arenas; treat the arena post-attach as + // a base-pointer wrapper. + // + // Re-attach (release + attach the same or another buffer) is permitted + // so the AICPU boot path can rebind the same pooled image each run. 
+ // + // The external buffer is NOT freed by release()/~DeviceArena(); ownership + // stays with the caller. Used for the prebuilt-arena fast path where + // a host-built image is rtMemcpy'd into a device buffer that DeviceRunner + // owns across runs. + void attach(void *external_base, size_t base_align = kDefaultBaseAlign) noexcept; + // Phase 3: pointer to the sub-region at `offset`. Asserts if called // before commit(). void *region_ptr(size_t offset) const noexcept; // Size of the sub-region whose offset matches `offset`. Linear scan; - // intended for debug / assertions, not hot path. + // intended for debug / assertions, not hot path. Returns 0 for an + // attached arena (attach() does not repopulate the region table) — + // callers in the prebuilt-image path keep sizes alongside their offsets + // instead. size_t region_size(size_t offset) const noexcept; // Free the backing buffer (if any) and reset to the pre-commit state so @@ -135,6 +162,9 @@ class DeviceArena { size_t raw_size_{0}; void *base_{nullptr}; bool committed_{false}; + // True when committed via attach(): the backing buffer is externally + // owned, so release() must not call free_(). + bool attached_{false}; size_t alloc_count_{0}; size_t free_count_{0}; @@ -166,6 +196,38 @@ inline void *DeviceArena::commit(size_t base_align) { return base_; } +inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept { + // Re-attach (e.g. AICPU boot path attaches each run) is fine: only an + // attached state can be "re-attached" — release() it first to keep + // semantics tight. A real commit() (alloc-backed) must not be silently + // dropped, so still trap on that. 
+ if (committed_) { + assert(attached_ && "DeviceArena::attach() called after commit (only re-attach is allowed)"); + release(); + } + // The external buffer must already be base_align-aligned by the caller — + // forward-align in-place would shift the visible base off the address the + // caller advertised (and that the prebuilt image was constructed for). + // The checks below are promoted to unconditional aborts (rather than + // plain assert()) because a misaligned attach silently produces a buffer + // whose visible base disagrees with every offset the prebuilt image was + // laid out against — release builds, which strip assert(), would still + // run on a corrupted arena. Aborting at the breakage point is far cheaper + // to triage than the downstream wild-pointer accesses. + const auto raw = reinterpret_cast(external_base); + const bool ok = (external_base != nullptr) && (base_align > 0) && ((base_align & (base_align - 1)) == 0) && + ((raw & (static_cast(base_align) - 1)) == 0); + if (!ok) { + assert(false && "DeviceArena::attach(): null base, non-power-of-two align, or pre-alignment violated"); + std::abort(); + } + base_ = external_base; + raw_base_ = nullptr; + raw_size_ = 0; + committed_ = true; + attached_ = true; +} + inline void *DeviceArena::region_ptr(size_t offset) const noexcept { assert(committed_ && "DeviceArena::region_ptr() called before commit()"); return reinterpret_cast(base_) + offset; @@ -179,7 +241,8 @@ inline size_t DeviceArena::region_size(size_t offset) const noexcept { } inline void DeviceArena::release() noexcept { - if (raw_base_ != nullptr) { + // attached arenas wrap externally-owned memory — never free. 
+ if (raw_base_ != nullptr && !attached_) { free_(ctx_, raw_base_); ++free_count_; } @@ -189,4 +252,5 @@ inline void DeviceArena::release() noexcept { cursor_ = 0; region_count_ = 0; committed_ = false; + attached_ = false; } diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 89314d800..39cf5977b 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp ${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp + ${A2A3_RUNTIME_DIR}/shared/pto_runtime2_init.cpp ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp ) target_include_directories(a2a3_rt_objs PUBLIC @@ -193,6 +194,45 @@ function(add_a5_test name src) add_test(NAME ${name} COMMAND ${name}) endfunction() +# --------------------------------------------------------------------------- +# A5 runtime sources, mirroring a2a3_rt_objs. Bundled into an OBJECT library +# so the runtime .cpp files compile once and the resulting .o files are +# reused across every a5 runtime test executable. 
+# --------------------------------------------------------------------------- +set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime) + +add_library(a5_rt_objs OBJECT + ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp + ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp + ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp + ${A5_RUNTIME_DIR}/shared/pto_tensormap.cpp + ${A5_RUNTIME_DIR}/shared/pto_runtime2_init.cpp + ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp +) +target_include_directories(a5_rt_objs PUBLIC + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common + ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface + ${CMAKE_SOURCE_DIR}/../../../src/common/log/include + ${CMAKE_SOURCE_DIR}/../../../src/common/device_comm +) +target_compile_options(a5_rt_objs PUBLIC -D_GLIBCXX_USE_CXX11_ABI=0) + +function(add_a5_runtime_test name src) + add_executable(${name} ${src}) + target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS}) + target_link_libraries(${name} PRIVATE + a5_rt_objs + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread + ) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + function(add_task_interface_test name src) add_executable(${name} ${src}) target_include_directories(${name} PRIVATE @@ -313,6 +353,21 @@ add_a2a3_runtime_test(test_wiring a2a3/test_wiring.cpp) # --------------------------------------------------------------------------- add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp) +# A5 trb runtime UTs — mirror of a2a3 trb runtime UTs, link against a5_rt_objs. 
+# Target names carry the a5_ prefix because hierarchical/test_tensormap (and +# the unprefixed a2a3 runtime targets test_scheduler_state / test_ready_queue +# / ...) already own those bare names. +add_a5_runtime_test(test_a5_task_allocator a5/test_task_allocator.cpp) +add_a5_runtime_test(test_a5_dep_list_pool a5/test_dep_list_pool.cpp) +add_a5_runtime_test(test_a5_scheduler_state a5/test_scheduler_state.cpp) +add_a5_runtime_test(test_a5_task_state a5/test_task_state.cpp) +add_a5_runtime_test(test_a5_ready_queue a5/test_ready_queue.cpp) +add_a5_runtime_test(test_a5_shared_memory a5/test_shared_memory.cpp) +add_a5_runtime_test(test_a5_tensormap a5/test_tensormap.cpp) +add_a5_runtime_test(test_a5_fanin_pool a5/test_fanin_pool.cpp) +add_a5_runtime_test(test_a5_spsc_queue a5/test_spsc_queue.cpp) +add_a5_runtime_test(test_a5_wiring a5/test_wiring.cpp) + # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp # alongside the test (faster than dlopen'ing libsimpler_log.so for a unit test). 
set(SIMPLER_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/common/log) diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp index 413e36cfd..f12b1e7c7 100644 --- a/tests/ut/cpp/a2a3/test_ready_queue.cpp +++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp @@ -61,7 +61,8 @@ class ReadyQueueTest : public ::testing::Test { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { @@ -231,7 +232,8 @@ class ReadyQueueBoundaryTest : public ::testing::Test { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, QUEUE_CAP)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { ready_queue_destroy(&queue); @@ -330,7 +332,8 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { ready_queue_destroy(&queue); diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp index 952aad55a..75476dedf 100644 --- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp +++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp @@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto 
layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp index 28e43d5a2..5dce3ba4a 100644 --- a/tests/ut/cpp/a2a3/test_spsc_queue.cpp +++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp @@ -47,7 +47,8 @@ class SpscQueueTest : public ::testing::Test { memset(&queue, 0, sizeof(queue)); const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(queue.init_from_layout(arena, off, CAPACITY)); + ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY)); + queue.wire_arena_pointers(arena, off); } void TearDown() override { @@ -74,9 +75,9 @@ TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { const size_t off = PTO2SpscQueue::reserve_layout(local, 1); // dummy reservation so commit succeeds (void)off; ASSERT_NE(local.commit(), nullptr); - EXPECT_FALSE(bad.init_from_layout(local, off, 3)); - EXPECT_FALSE(bad.init_from_layout(local, off, 7)); - EXPECT_FALSE(bad.init_from_layout(local, off, 0)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 3)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 7)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 0)); } TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { @@ -85,9 +86,9 @@ TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4); const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024); ASSERT_NE(local.commit(), nullptr); - EXPECT_TRUE(q.init_from_layout(local, off4, 4)); + EXPECT_TRUE(q.init_data_from_layout(local, off4, 4)); q.destroy(); - EXPECT_TRUE(q.init_from_layout(local, off1024, 1024)); + 
EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024)); q.destroy(); } diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp index 383003900..512e241a2 100644 --- a/tests/ut/cpp/a2a3/test_task_allocator.cpp +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) { TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { current_index.store(INT32_MAX - 2); last_alive.store(INT32_MAX - 2); - allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + allocator.init( + descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code, + /*initial_local_task_id=*/INT32_MAX - 2 + ); auto r1 = allocator.alloc(0); ASSERT_FALSE(r1.failed()); diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp index 729b74999..ffced6f9a 100644 --- a/tests/ut/cpp/a2a3/test_task_state.cpp +++ b/tests/ut/cpp/a2a3/test_task_state.cpp @@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp index 204d00e42..805a9e079 100644 --- a/tests/ut/cpp/a2a3/test_tensormap.cpp +++ b/tests/ut/cpp/a2a3/test_tensormap.cpp @@ -83,7 +83,8 @@ class TensorMapTest : public ::testing::Test { int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE}; auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes); ASSERT_NE(arena.commit(), nullptr); - 
ASSERT_TRUE(tmap.init_from_layout(layout, arena)); + ASSERT_TRUE(tmap.init_data_from_layout(layout, arena)); + tmap.wire_arena_pointers(layout, arena); } void TearDown() override { @@ -113,7 +114,8 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8}; auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws); ASSERT_NE(bad_arena.commit(), nullptr); - EXPECT_TRUE(bad.init_from_layout(layout, bad_arena)); + EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena)); + bad.wire_arena_pointers(layout, bad_arena); bad.destroy(); } diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp index b01052a85..1e8fee9c5 100644 --- a/tests/ut/cpp/a2a3/test_wiring.cpp +++ b/tests/ut/cpp/a2a3/test_wiring.cpp @@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp index 9dea3ae94..f12b1e7c7 100644 --- a/tests/ut/cpp/a5/test_ready_queue.cpp +++ b/tests/ut/cpp/a5/test_ready_queue.cpp @@ -44,6 +44,7 @@ #include #include +#include "device_arena.h" #include "scheduler/pto_scheduler.h" // ============================================================================= @@ -55,10 +56,19 @@ class ReadyQueueTest : public ::testing::Test { static constexpr uint64_t CAPACITY = 16; // Power of 2 PTO2ReadyQueue queue; + DeviceArena arena; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); } + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + 
ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } - void TearDown() override { ready_queue_destroy(&queue); } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; // ============================================================================= @@ -217,8 +227,18 @@ class ReadyQueueBoundaryTest : public ::testing::Test { PTO2ReadyQueue queue{}; PTO2TaskSlotState dummy[8]{}; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, QUEUE_CAP)); } - void TearDown() override { ready_queue_destroy(&queue); } + DeviceArena arena; + + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { @@ -307,8 +327,18 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam { static constexpr uint64_t CAPACITY = 1024; PTO2ReadyQueue queue; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); } - void TearDown() override { ready_queue_destroy(&queue); } + DeviceArena arena; + + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) { diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp index 952aad55a..75476dedf 100644 --- a/tests/ut/cpp/a5/test_scheduler_state.cpp +++ 
b/tests/ut/cpp/a5/test_scheduler_state.cpp @@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp index a2c80ca05..5dce3ba4a 100644 --- a/tests/ut/cpp/a5/test_spsc_queue.cpp +++ b/tests/ut/cpp/a5/test_spsc_queue.cpp @@ -27,6 +27,7 @@ #include #include +#include "device_arena.h" #include "scheduler/pto_scheduler.h" // ============================================================================= @@ -38,15 +39,22 @@ class SpscQueueTest : public ::testing::Test { static constexpr uint64_t CAPACITY = 16; // must be power of 2 PTO2SpscQueue queue{}; + DeviceArena arena; // Dummy slot states used as push values alignas(64) PTO2TaskSlotState slots[64]{}; void SetUp() override { memset(&queue, 0, sizeof(queue)); - ASSERT_TRUE(queue.init(CAPACITY)); + const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY)); + queue.wire_arena_pointers(arena, off); } - void TearDown() override { queue.destroy(); } + void TearDown() override { + queue.destroy(); + arena.release(); + } }; // ============================================================================= @@ -60,17 +68,27 @@ TEST_F(SpscQueueTest, InitValidState) { } TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + // init_data_from_layout rejects non-power-of-two capacities. Use a fresh arena + // each time since reserve runs before commit. 
PTO2SpscQueue bad{}; - EXPECT_FALSE(bad.init(3)); - EXPECT_FALSE(bad.init(7)); - EXPECT_FALSE(bad.init(0)); + DeviceArena local; + const size_t off = PTO2SpscQueue::reserve_layout(local, 1); // dummy reservation so commit succeeds + (void)off; + ASSERT_NE(local.commit(), nullptr); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 3)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 7)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 0)); } TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { PTO2SpscQueue q{}; - EXPECT_TRUE(q.init(4)); + DeviceArena local; + const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4); + const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024); + ASSERT_NE(local.commit(), nullptr); + EXPECT_TRUE(q.init_data_from_layout(local, off4, 4)); q.destroy(); - EXPECT_TRUE(q.init(1024)); + EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024)); q.destroy(); } diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp index 383003900..512e241a2 100644 --- a/tests/ut/cpp/a5/test_task_allocator.cpp +++ b/tests/ut/cpp/a5/test_task_allocator.cpp @@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) { TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { current_index.store(INT32_MAX - 2); last_alive.store(INT32_MAX - 2); - allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + allocator.init( + descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code, + /*initial_local_task_id=*/INT32_MAX - 2 + ); auto r1 = allocator.alloc(0); ASSERT_FALSE(r1.failed()); diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp index 729b74999..ffced6f9a 100644 --- a/tests/ut/cpp/a5/test_task_state.cpp +++ b/tests/ut/cpp/a5/test_task_state.cpp @@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = 
PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp index 10eef0317..ec83a064d 100644 --- a/tests/ut/cpp/a5/test_tensormap.cpp +++ b/tests/ut/cpp/a5/test_tensormap.cpp @@ -28,6 +28,7 @@ #include #include +#include "device_arena.h" #include "pto_orchestration_api.h" #include "pto_tensormap.h" @@ -76,13 +77,20 @@ class TensorMapTest : public ::testing::Test { static constexpr int32_t WINDOW_SIZE = 32; PTO2TensorMap tmap{}; + DeviceArena arena; void SetUp() override { int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE}; - ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes)); + auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(tmap.init_data_from_layout(layout, arena)); + tmap.wire_arena_pointers(layout, arena); } - void TearDown() override { tmap.destroy(); } + void TearDown() override { + tmap.destroy(); + arena.release(); + } }; // ============================================================================= @@ -97,13 +105,19 @@ TEST_F(TensorMapTest, InitValidState) { EXPECT_EQ(tmap.valid_count(), 0); } -TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { - PTO2TensorMap bad{}; +TEST_F(TensorMapTest, InitWithPowerOfTwoBucketsSucceeds) { + // The reject path for non-power-of-2 bucket counts is enforced via an + // always_assert inside reserve_layout. It is not asserted here because + // EXPECT_DEATH cannot run reliably in release builds where always_assert + // may compile out. Cover only the accepted (power-of-2) shape. 
+ PTO2TensorMap ok{}; + DeviceArena ok_arena; int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8}; - EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail"; - EXPECT_FALSE(bad.init(7, 64, ws)); - EXPECT_TRUE(bad.init(8, 64, ws)); - bad.destroy(); + auto layout = PTO2TensorMap::reserve_layout(ok_arena, 8, 64, ws); + ASSERT_NE(ok_arena.commit(), nullptr); + EXPECT_TRUE(ok.init_data_from_layout(layout, ok_arena)); + ok.wire_arena_pointers(layout, ok_arena); + ok.destroy(); } // ============================================================================= diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp index b01052a85..1e8fee9c5 100644 --- a/tests/ut/cpp/a5/test_wiring.cpp +++ b/tests/ut/cpp/a5/test_wiring.cpp @@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override {