From ca4a8849ec5eaa4a0cf2d65ea80a6be771ec85bf Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 27 May 2026 10:46:28 +0800 Subject: [PATCH] Refactor: persist AICPU/AICore streams + eager bootstrap at simpler_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related lifecycle changes on onboard DeviceRunner (a2a3 + a5) that both move per-run work to one-shot init/finalize: 1. Streams now live for the DeviceRunner's lifetime. - rtStreamCreate / rtStreamDestroy were happening on every prepare_callable and run_prepared call (4 rtStream* per launch, ~ms each). The stream check inside prepare_run_context already short-circuits on existing streams, so the per-run create/destroy was strictly redundant once streams persist. - release_run_context becomes a no-op; finalize gains the matching rtStreamDestroy pair. simpler_init triggers stream creation eagerly via ensure_device_initialized. 2. Bootstrap (BootstrapDispatcher + LoadAicpuOp::Init + AicpuSoInfo H2D + init_device_args) moves from "first run() call" to simpler_init. - The previous laziness was a stream-lifecycle side effect: bootstrap needs a stream, streams were per-run, so bootstrap had to wait for the first run. With persistent streams, that constraint is gone. - ensure_device_initialized is moved to the public section on DeviceRunner so simpler_init can call it directly after the executor bytes are cached. ABI / Python surface unchanged. Sim platforms untouched (no streams or bootstrap there). 
Hardware validation (Ascend910, device 3): - aicore_op_timeout (a2a3, a5): PASS - paged_attention_unroll (a2a3 — HANDOFF's canary): PASS - vector_add, hello_worker, paged_attention_manual_scope: PASS - a2a3sim hello_worker (sanity for unchanged sim path): PASS Benchmark (tensormap_and_ringbuffer, device 3, 100 rounds): - Total / Sched / Orch: ±1% (device-side wall is untouched, as expected) - Round 0 (cold start) Host: 7 of 9 examples improved 5%–50%, max -424 ms on paged_attention_unroll C1 (consistent with BootstrapDispatcher being the ~200-500 ms first-run cost). The remaining two examples were within per-example noise (±100 ms band on a single round). - Steady-state Host (round 50+ mean, summed across 9 examples): -12% aggregate, but per-example deltas are noise-dominated (single-example variance ±200 ms); the small per-run rtStream*/no-op-release saving (~0.5-1 ms per run) sits well below the host noise floor, so only the aggregate is interpretable. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../platform/onboard/host/device_runner.cpp | 36 +++++++----- .../platform/onboard/host/device_runner.h | 55 +++++++++++-------- .../onboard/host/pto_runtime_c_api.cpp | 21 +++++-- .../platform/onboard/host/device_runner.cpp | 35 ++++++++---- src/a5/platform/onboard/host/device_runner.h | 55 +++++++++++-------- .../onboard/host/pto_runtime_c_api.cpp | 18 +++++- 6 files changed, 143 insertions(+), 77 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index cf6ddea88..e39ca1a0b 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -296,8 +296,9 @@ std::thread DeviceRunner::create_thread(std::function fn) { } int DeviceRunner::ensure_device_initialized() { - // First attach the current thread and create fresh run-scoped streams. - // device_id_ was set in attach_current_thread() during simpler_init. 
+ // Attach the current thread to the device and create the persistent + // AICPU/AICore streams (destroyed in finalize()). device_id_ was set in + // attach_current_thread() during simpler_init. int rc = prepare_run_context(device_id_); if (rc != 0) { return rc; @@ -420,8 +421,6 @@ int DeviceRunner::prepare_run_context(int device_id) { return 0; } - release_run_context(); - // Create streams rc = rtStreamCreate(&stream_aicpu_, 0); if (rc != 0) { @@ -442,15 +441,10 @@ int DeviceRunner::prepare_run_context(int device_id) { } void DeviceRunner::release_run_context() { - // Destroy streams (they belong to the current thread's CANN context) - if (stream_aicpu_ != nullptr) { - rtStreamDestroy(stream_aicpu_); - stream_aicpu_ = nullptr; - } - if (stream_aicore_ != nullptr) { - rtStreamDestroy(stream_aicore_); - stream_aicore_ = nullptr; - } + // Streams now live for the lifetime of the DeviceRunner (created at + // simpler_init time via ensure_device_initialized, destroyed in finalize). + // Per-run release is intentionally a no-op so prepare_run_context's stream + // check short-circuits across all prepare_callable / run_prepared calls. } int DeviceRunner::ensure_binaries_loaded() { @@ -1170,6 +1164,22 @@ int DeviceRunner::finalize() { release_run_context(); + // Streams are persistent for the DeviceRunner's lifetime; destroy them here. + // Intentionally no pre-destroy sync: when a run hits the AICore op-timeout + // chain (PR #718), the AICPU stream surfaces ACL_ERROR_RT_AICPU_EXCEPTION + // (507018) at run-path sync; calling aclrtSynchronizeStream* again on the + // error-state stream at finalize wedges subsequent tests (observed: 507018 + // / 507899 / 507901 cascade across the whole st-onboard-a2a3 suite). + // rtStreamDestroy on an error-state stream is the supported teardown path. 
+ if (stream_aicpu_ != nullptr) { + rtStreamDestroy(stream_aicpu_); + stream_aicpu_ = nullptr; + } + if (stream_aicore_ != nullptr) { + rtStreamDestroy(stream_aicore_); + stream_aicore_ = nullptr; + } + // Cleanup kernel args (deviceArgs) kernel_args_.finalize_device_args(); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 53fb6555f..ae89f9172 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -283,12 +283,13 @@ class DeviceRunner { /** * Take ownership of the dispatcher SO bytes. Called by simpler_init when - * the caller provided a dispatcher path; ensure_binaries_loaded() hands - * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. - * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail - * with a clear message — callers that drive _ChipWorker.init directly - * without a dispatcher path get a deterministic error at run() time - * rather than a confusing dladdr-derived path. + * the caller provided a dispatcher path; the eager + * ensure_device_initialized() in simpler_init hands the buffer to + * LoadAicpuOp::BootstrapDispatcher at init time. Leaving this unset + * (empty buffer) makes ensure_binaries_loaded() fail with a clear + * message — callers that drive _ChipWorker.init directly without a + * dispatcher path get a deterministic error at simpler_init time rather + * than a confusing dladdr-derived path. */ void set_dispatcher_binary(std::vector dispatcher_so_binary) { dispatcher_so_binary_ = std::move(dispatcher_so_binary); @@ -455,12 +456,34 @@ class DeviceRunner { /** * Release run-scoped resources owned by the current thread. * - * This destroys AICPU/AICore streams but intentionally preserves device - * allocations, uploaded binaries, and other session state so they can be - * finalized later before rtDeviceReset(). 
+ * No-op since streams now live for the DeviceRunner's lifetime (created at + * simpler_init via ensure_device_initialized, destroyed in finalize). + * Retained as a name so the c_api RAII guards keep their existing shape. */ void release_run_context(); + /** + * One-shot device initialization. Performs, in order: + * 1. rtSetDevice + rtStreamCreate for AICPU and AICore streams. Streams + * live for the DeviceRunner's lifetime and are destroyed in finalize. + * 2. Bundles dispatcher SO bytes + inner AICPU kernel SO bytes through + * `LoadAicpuOp::BootstrapDispatcher` so the inner SO is written to + * the device-side preinstall path. + * 3. Registers the inner SO via `LoadAicpuOp::Init` + * (`rtsBinaryLoadFromFile` + `rtsFuncGetByName`) and caches the + * resulting per-symbol `rtFuncHandle` for per-task `rtsLaunchCpuKernel`. + * 4. Allocates the device-side AicpuSoInfo H2D copy and stamps + * `device_args_.aicpu_so_bin/len` (load-bearing on a5 onboard). + * + * Called once from `simpler_init` after the executor + dispatcher bytes are + * cached on the runner. Idempotent: subsequent calls short-circuit on + * binaries_loaded_. Reads device_id_ recorded by attach_current_thread. + * + * @return 0 on success, error code on failure (e.g. dispatcher SO bytes + * not provided, CANN stream create / register failures). + */ + int ensure_device_initialized(); + /** * Stage a per-callable_id orchestration SO into device memory and remember * the supporting metadata (entry/config symbol names, kernel func_id ↔ @@ -706,20 +729,6 @@ class DeviceRunner { // dep_gen collector — captures orchestrator submit_task inputs for offline replay DepGenCollector dep_gen_collector_; - /** - * Ensure device is initialized (lazy initialization) - * - * Checks if device is already initialized. 
If not, performs: - * - Attach the current thread to the device - * - Create AICPU and AICore streams - * - Load AICPU SO to device memory - * - Initialize device args - * - * Reads the bound device id and executor binaries from runner state. - * @return 0 on success, error code on failure - */ - int ensure_device_initialized(); - /** * Query the maximum block_dim the stream can host. * diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 744b7291c..6effdf7d3 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -259,11 +259,11 @@ int simpler_init( std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); // Dispatcher SO bytes are passed alongside the executors. Onboard - // requires a non-empty buffer: BootstrapDispatcher reads from it on - // the first run() to upload the dispatcher + inner SO bundle through + // requires a non-empty buffer: BootstrapDispatcher reads from it to + // upload the dispatcher + inner SO bundle through // libaicpu_extend_kernels. If the caller drives _ChipWorker.init - // directly without a dispatcher path, this stays empty and any later - // run() fails fast in ensure_binaries_loaded with a clear message. + // directly without a dispatcher path, this stays empty and the + // ensure_device_initialized call below fails fast with a clear message. if (dispatcher_binary != NULL && dispatcher_size > 0) { std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); runner->set_dispatcher_binary(std::move(dispatcher_vec)); @@ -271,6 +271,19 @@ int simpler_init( } catch (...) 
{ return -1; } + + // Eagerly run the one-shot device setup: create persistent AICPU/AICore + // streams, upload the dispatcher + inner SO bundle, and resolve the per- + // symbol rtFuncHandle for per-task launch — so the first prepare_callable + // / run_prepared does not pay any of these costs. Streams live until + // finalize_device; the cached rtFuncHandle on LoadAicpuOp and the + // preinstall file both live until ~DeviceRunner. + try { + rc = runner->ensure_device_initialized(); + } catch (...) { + return -1; + } + if (rc != 0) return rc; return 0; } diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 38242555d..066e3995a 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -240,8 +240,9 @@ std::thread DeviceRunner::create_thread(std::function fn) { } int DeviceRunner::ensure_device_initialized() { - // First attach the current thread and create fresh run-scoped streams. - // device_id_ was set in attach_current_thread() during simpler_init. + // Attach the current thread to the device and create the persistent + // AICPU/AICore streams (destroyed in finalize()). device_id_ was set in + // attach_current_thread() during simpler_init. 
int rc = prepare_run_context(device_id_); if (rc != 0) { return rc; @@ -304,8 +305,6 @@ int DeviceRunner::prepare_run_context(int device_id) { return 0; } - release_run_context(); - // Create streams rc = rtStreamCreate(&stream_aicpu_, 0); if (rc != 0) { @@ -326,14 +325,10 @@ int DeviceRunner::prepare_run_context(int device_id) { } void DeviceRunner::release_run_context() { - if (stream_aicpu_ != nullptr) { - rtStreamDestroy(stream_aicpu_); - stream_aicpu_ = nullptr; - } - if (stream_aicore_ != nullptr) { - rtStreamDestroy(stream_aicore_); - stream_aicore_ = nullptr; - } + // Streams now live for the lifetime of the DeviceRunner (created at + // simpler_init time via ensure_device_initialized, destroyed in finalize). + // Per-run release is intentionally a no-op so prepare_run_context's stream + // check short-circuits across all prepare_callable / run_prepared calls. } int DeviceRunner::ensure_binaries_loaded() { @@ -978,6 +973,22 @@ int DeviceRunner::finalize() { release_run_context(); + // Streams are persistent for the DeviceRunner's lifetime; destroy them here. + // Intentionally no pre-destroy sync: when a run hits the AICore op-timeout + // chain (PR #718), the AICPU stream surfaces ACL_ERROR_RT_AICPU_EXCEPTION + // (507018) at run-path sync; calling aclrtSynchronizeStream* again on the + // error-state stream at finalize wedges subsequent tests (observed: 507018 + // / 507899 / 507901 cascade across the whole st-onboard-a2a3 suite). + // rtStreamDestroy on an error-state stream is the supported teardown path. + if (stream_aicpu_ != nullptr) { + rtStreamDestroy(stream_aicpu_); + stream_aicpu_ = nullptr; + } + if (stream_aicore_ != nullptr) { + rtStreamDestroy(stream_aicore_); + stream_aicore_ = nullptr; + } + // Cleanup kernel args (deviceArgs); device-side KernelArgs + runtime args // are released by runtime_args_cleanup RAII so they also unwind on errors. 
kernel_args_.finalize_device_args(); diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index a07ab28bb..89e93c03b 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -271,12 +271,13 @@ class DeviceRunner { /** * Take ownership of the dispatcher SO bytes. Called by simpler_init when - * the caller provided a dispatcher path; ensure_binaries_loaded() hands - * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. - * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail - * with a clear message — callers that drive _ChipWorker.init directly - * without a dispatcher path get a deterministic error at run() time - * rather than a confusing dladdr-derived path. + * the caller provided a dispatcher path; the eager + * ensure_device_initialized() in simpler_init hands the buffer to + * LoadAicpuOp::BootstrapDispatcher at init time. Leaving this unset + * (empty buffer) makes ensure_binaries_loaded() fail with a clear + * message — callers that drive _ChipWorker.init directly without a + * dispatcher path get a deterministic error at simpler_init time rather + * than a confusing dladdr-derived path. */ void set_dispatcher_binary(std::vector dispatcher_so_binary) { dispatcher_so_binary_ = std::move(dispatcher_so_binary); @@ -403,12 +404,34 @@ class DeviceRunner { /** * Release run-scoped resources owned by the current thread. * - * This destroys AICPU/AICore streams but intentionally preserves device - * allocations, uploaded binaries, and other session state so they can be - * finalized later before rtDeviceReset(). + * No-op since streams now live for the DeviceRunner's lifetime (created at + * simpler_init via ensure_device_initialized, destroyed in finalize). + * Retained as a name so the c_api RAII guards keep their existing shape. */ void release_run_context(); + /** + * One-shot device initialization. Performs, in order: + * 1. 
rtSetDevice + rtStreamCreate for AICPU and AICore streams. Streams + * live for the DeviceRunner's lifetime and are destroyed in finalize. + * 2. Bundles dispatcher SO bytes + inner AICPU kernel SO bytes through + * `LoadAicpuOp::BootstrapDispatcher` so the inner SO is written to + * the device-side preinstall path. + * 3. Registers the inner SO via `LoadAicpuOp::Init` + * (`rtsBinaryLoadFromFile` + `rtsFuncGetByName`) and caches the + * resulting per-symbol `rtFuncHandle` for per-task `rtsLaunchCpuKernel`. + * 4. Allocates the device-side AicpuSoInfo H2D copy and stamps + * `device_args_.aicpu_so_bin/len` (load-bearing on a5 onboard). + * + * Called once from `simpler_init` after the executor + dispatcher bytes are + * cached on the runner. Idempotent: subsequent calls short-circuit on + * binaries_loaded_. Reads device_id_ recorded by attach_current_thread. + * + * @return 0 on success, error code on failure (e.g. dispatcher SO bytes + * not provided, CANN stream create / register failures). + */ + int ensure_device_initialized(); + /** * Stage a per-callable_id orchestration SO into device memory and remember * the supporting metadata (entry/config symbol names, kernel func_id ↔ @@ -598,20 +621,6 @@ class DeviceRunner { // PMU profiling (per-task AICore hardware counters) PmuCollector pmu_collector_; - /** - * Ensure device is initialized (lazy initialization) - * - * Checks if device is already initialized. If not, performs: - * - Attach the current thread to the device - * - Create AICPU and AICore streams - * - Load AICPU SO to device memory - * - Initialize device args - * - * Reads the bound device id and executor binaries from runner state. - * @return 0 on success, error code on failure - */ - int ensure_device_initialized(); - /** * Query the maximum block_dim the stream can host. 
* diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 0cc17c81f..5c075cf87 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -325,8 +325,9 @@ int simpler_init( std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); // Dispatcher SO bytes — see a2a3 sibling for rationale. Empty buffer - // is permitted at simpler_init time; ensure_binaries_loaded surfaces - // the error if/when the bootstrap is actually attempted. + // is permitted at this point; the ensure_device_initialized call + // below fails fast with a clear message if bootstrap is required but + // the dispatcher path was not provided. if (dispatcher_binary != NULL && dispatcher_size > 0) { std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); runner->set_dispatcher_binary(std::move(dispatcher_vec)); @@ -334,6 +335,19 @@ int simpler_init( } catch (...) { return -1; } + + // Eagerly run the one-shot device setup: create persistent AICPU/AICore + // streams, upload the dispatcher + inner SO bundle, and resolve the per- + // symbol rtFuncHandle for per-task launch — so the first prepare_callable + // / run_prepared does not pay any of these costs. Streams live until + // finalize_device; the cached rtFuncHandle on LoadAicpuOp and the + // preinstall file both live until ~DeviceRunner. + try { + rc = runner->ensure_device_initialized(); + } catch (...) { + return -1; + } + if (rc != 0) return rc; return 0; } /* ===========================================================================