Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 23 additions & 13 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,9 @@ std::thread DeviceRunner::create_thread(std::function<void()> fn) {
}

int DeviceRunner::ensure_device_initialized() {
// First attach the current thread and create fresh run-scoped streams.
// device_id_ was set in attach_current_thread() during simpler_init.
// Attach the current thread to the device and create the persistent
// AICPU/AICore streams (destroyed in finalize()). device_id_ was set in
// attach_current_thread() during simpler_init.
int rc = prepare_run_context(device_id_);
if (rc != 0) {
return rc;
Expand Down Expand Up @@ -420,8 +421,6 @@ int DeviceRunner::prepare_run_context(int device_id) {
return 0;
}

release_run_context();

// Create streams
rc = rtStreamCreate(&stream_aicpu_, 0);
if (rc != 0) {
Expand All @@ -442,15 +441,10 @@ int DeviceRunner::prepare_run_context(int device_id) {
}

void DeviceRunner::release_run_context() {
// Destroy streams (they belong to the current thread's CANN context)
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
stream_aicpu_ = nullptr;
}
if (stream_aicore_ != nullptr) {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}
// Streams now live for the lifetime of the DeviceRunner (created at
// simpler_init time via ensure_device_initialized, destroyed in finalize).
// Per-run release is intentionally a no-op so prepare_run_context's stream
// check short-circuits across all prepare_callable / run_prepared calls.
}

int DeviceRunner::ensure_binaries_loaded() {
Expand Down Expand Up @@ -1170,6 +1164,22 @@ int DeviceRunner::finalize() {

release_run_context();

// Streams are persistent for the DeviceRunner's lifetime; destroy them here.
// Intentionally no pre-destroy sync: when a run hits the AICore op-timeout
// chain (PR #718), the AICPU stream surfaces ACL_ERROR_RT_AICPU_EXCEPTION
// (507018) at run-path sync; calling aclrtSynchronizeStream* again on the
// error-state stream at finalize wedges subsequent tests (observed: 507018
// / 507899 / 507901 cascade across the whole st-onboard-a2a3 suite).
// rtStreamDestroy on an error-state stream is the supported teardown path.
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
stream_aicpu_ = nullptr;
}
if (stream_aicore_ != nullptr) {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}

// Cleanup kernel args (deviceArgs)
kernel_args_.finalize_device_args();

Expand Down
55 changes: 32 additions & 23 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,13 @@ class DeviceRunner {

/**
* Take ownership of the dispatcher SO bytes. Called by simpler_init when
* the caller provided a dispatcher path; ensure_binaries_loaded() hands
* the buffer to LoadAicpuOp::BootstrapDispatcher on the first run.
* Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail
* with a clear message — callers that drive _ChipWorker.init directly
* without a dispatcher path get a deterministic error at run() time
* rather than a confusing dladdr-derived path.
* the caller provided a dispatcher path; the eager
* ensure_device_initialized() in simpler_init hands the buffer to
* LoadAicpuOp::BootstrapDispatcher at init time. Leaving this unset
* (empty buffer) makes ensure_binaries_loaded() fail with a clear
* message — callers that drive _ChipWorker.init directly without a
* dispatcher path get a deterministic error at simpler_init time rather
* than a confusing dladdr-derived path.
*/
void set_dispatcher_binary(std::vector<uint8_t> dispatcher_so_binary) {
dispatcher_so_binary_ = std::move(dispatcher_so_binary);
Expand Down Expand Up @@ -455,12 +456,34 @@ class DeviceRunner {
/**
* Release run-scoped resources owned by the current thread.
*
* This destroys AICPU/AICore streams but intentionally preserves device
* allocations, uploaded binaries, and other session state so they can be
* finalized later before rtDeviceReset().
* No-op: streams now live for the DeviceRunner's lifetime (created at
* simpler_init via ensure_device_initialized, destroyed in finalize).
* The function is kept as an empty entry point so the c_api RAII guards
* keep their existing shape.
*/
void release_run_context();

/**
* One-shot device initialization. Performs, in order:
* 1. rtSetDevice + rtStreamCreate for AICPU and AICore streams. Streams
* live for the DeviceRunner's lifetime and are destroyed in finalize.
* 2. Bundles dispatcher SO bytes + inner AICPU kernel SO bytes through
* `LoadAicpuOp::BootstrapDispatcher` so the inner SO is written to
* the device-side preinstall path.
* 3. Registers the inner SO via `LoadAicpuOp::Init`
* (`rtsBinaryLoadFromFile` + `rtsFuncGetByName`) and caches the
* resulting per-symbol `rtFuncHandle` for per-task `rtsLaunchCpuKernel`.
* 4. Allocates the device-side AicpuSoInfo H2D copy and stamps
* `device_args_.aicpu_so_bin/len` (load-bearing on a5 onboard).
*
* Called once from `simpler_init` after the executor + dispatcher bytes are
* cached on the runner. Idempotent: subsequent calls short-circuit on
* binaries_loaded_. Reads device_id_ recorded by attach_current_thread.
*
* @return 0 on success, error code on failure (e.g. dispatcher SO bytes
* not provided, CANN stream create / register failures).
*/
int ensure_device_initialized();

/**
* Stage a per-callable_id orchestration SO into device memory and remember
* the supporting metadata (entry/config symbol names, kernel func_id ↔
Expand Down Expand Up @@ -706,20 +729,6 @@ class DeviceRunner {
// dep_gen collector — captures orchestrator submit_task inputs for offline replay
DepGenCollector dep_gen_collector_;

/**
* Ensure device is initialized (lazy initialization)
*
* Checks if device is already initialized. If not, performs:
* - Attach the current thread to the device
* - Create AICPU and AICore streams
* - Load AICPU SO to device memory
* - Initialize device args
*
* Reads the bound device id and executor binaries from runner state.
* @return 0 on success, error code on failure
*/
int ensure_device_initialized();

/**
* Query the maximum block_dim the stream can host.
*
Expand Down
21 changes: 17 additions & 4 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,18 +259,31 @@ int simpler_init(
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec));
// Dispatcher SO bytes are passed alongside the executors. Onboard
// requires a non-empty buffer: BootstrapDispatcher reads from it on
// the first run() to upload the dispatcher + inner SO bundle through
// requires a non-empty buffer: BootstrapDispatcher reads from it to
// upload the dispatcher + inner SO bundle through
// libaicpu_extend_kernels. If the caller drives _ChipWorker.init
// directly without a dispatcher path, this stays empty and any later
// run() fails fast in ensure_binaries_loaded with a clear message.
// directly without a dispatcher path, this stays empty and the
// ensure_device_initialized call below fails fast with a clear message.
if (dispatcher_binary != NULL && dispatcher_size > 0) {
std::vector<uint8_t> dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size);
runner->set_dispatcher_binary(std::move(dispatcher_vec));
}
} catch (...) {
return -1;
}

// Eagerly run the one-shot device setup: create persistent AICPU/AICore
// streams, upload the dispatcher + inner SO bundle, and resolve the per-
// symbol rtFuncHandle for per-task launch — so the first prepare_callable
// / run_prepared does not pay any of these costs. Streams live until
// finalize_device; the cached rtFuncHandle on LoadAicpuOp and the
// preinstall file both live until ~DeviceRunner.
try {
rc = runner->ensure_device_initialized();
} catch (...) {
return -1;
}
if (rc != 0) return rc;
return 0;
}

Expand Down
35 changes: 23 additions & 12 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ std::thread DeviceRunner::create_thread(std::function<void()> fn) {
}

int DeviceRunner::ensure_device_initialized() {
// First attach the current thread and create fresh run-scoped streams.
// device_id_ was set in attach_current_thread() during simpler_init.
// Attach the current thread to the device and create the persistent
// AICPU/AICore streams (destroyed in finalize()). device_id_ was set in
// attach_current_thread() during simpler_init.
int rc = prepare_run_context(device_id_);
if (rc != 0) {
return rc;
Expand Down Expand Up @@ -304,8 +305,6 @@ int DeviceRunner::prepare_run_context(int device_id) {
return 0;
}

release_run_context();

// Create streams
rc = rtStreamCreate(&stream_aicpu_, 0);
if (rc != 0) {
Expand All @@ -326,14 +325,10 @@ int DeviceRunner::prepare_run_context(int device_id) {
}

void DeviceRunner::release_run_context() {
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
stream_aicpu_ = nullptr;
}
if (stream_aicore_ != nullptr) {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}
// Streams now live for the lifetime of the DeviceRunner (created at
// simpler_init time via ensure_device_initialized, destroyed in finalize).
// Per-run release is intentionally a no-op so prepare_run_context's stream
// check short-circuits across all prepare_callable / run_prepared calls.
}

int DeviceRunner::ensure_binaries_loaded() {
Expand Down Expand Up @@ -978,6 +973,22 @@ int DeviceRunner::finalize() {

release_run_context();

// Streams are persistent for the DeviceRunner's lifetime; destroy them here.
// Intentionally no pre-destroy sync: when a run hits the AICore op-timeout
// chain (PR #718), the AICPU stream surfaces ACL_ERROR_RT_AICPU_EXCEPTION
// (507018) at run-path sync; calling aclrtSynchronizeStream* again on the
// error-state stream at finalize wedges subsequent tests (observed: 507018
// / 507899 / 507901 cascade across the whole st-onboard-a2a3 suite).
// rtStreamDestroy on an error-state stream is the supported teardown path.
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
stream_aicpu_ = nullptr;
}
if (stream_aicore_ != nullptr) {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}

// Cleanup kernel args (deviceArgs); device-side KernelArgs + runtime args
// are released by runtime_args_cleanup RAII so they also unwind on errors.
kernel_args_.finalize_device_args();
Expand Down
55 changes: 32 additions & 23 deletions src/a5/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,12 +271,13 @@ class DeviceRunner {

/**
* Take ownership of the dispatcher SO bytes. Called by simpler_init when
* the caller provided a dispatcher path; ensure_binaries_loaded() hands
* the buffer to LoadAicpuOp::BootstrapDispatcher on the first run.
* Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail
* with a clear message — callers that drive _ChipWorker.init directly
* without a dispatcher path get a deterministic error at run() time
* rather than a confusing dladdr-derived path.
* the caller provided a dispatcher path; the eager
* ensure_device_initialized() in simpler_init hands the buffer to
* LoadAicpuOp::BootstrapDispatcher at init time. Leaving this unset
* (empty buffer) makes ensure_binaries_loaded() fail with a clear
* message — callers that drive _ChipWorker.init directly without a
* dispatcher path get a deterministic error at simpler_init time rather
* than a confusing dladdr-derived path.
*/
void set_dispatcher_binary(std::vector<uint8_t> dispatcher_so_binary) {
dispatcher_so_binary_ = std::move(dispatcher_so_binary);
Expand Down Expand Up @@ -403,12 +404,34 @@ class DeviceRunner {
/**
* Release run-scoped resources owned by the current thread.
*
* This destroys AICPU/AICore streams but intentionally preserves device
* allocations, uploaded binaries, and other session state so they can be
* finalized later before rtDeviceReset().
* No-op: streams now live for the DeviceRunner's lifetime (created at
* simpler_init via ensure_device_initialized, destroyed in finalize).
* The function is kept as an empty entry point so the c_api RAII guards
* keep their existing shape.
*/
void release_run_context();

/**
* One-shot device initialization. Performs, in order:
* 1. rtSetDevice + rtStreamCreate for AICPU and AICore streams. Streams
* live for the DeviceRunner's lifetime and are destroyed in finalize.
* 2. Bundles dispatcher SO bytes + inner AICPU kernel SO bytes through
* `LoadAicpuOp::BootstrapDispatcher` so the inner SO is written to
* the device-side preinstall path.
* 3. Registers the inner SO via `LoadAicpuOp::Init`
* (`rtsBinaryLoadFromFile` + `rtsFuncGetByName`) and caches the
* resulting per-symbol `rtFuncHandle` for per-task `rtsLaunchCpuKernel`.
* 4. Allocates the device-side AicpuSoInfo H2D copy and stamps
* `device_args_.aicpu_so_bin/len` (load-bearing on a5 onboard).
*
* Called once from `simpler_init` after the executor + dispatcher bytes are
* cached on the runner. Idempotent: subsequent calls short-circuit on
* binaries_loaded_. Reads device_id_ recorded by attach_current_thread.
*
* @return 0 on success, error code on failure (e.g. dispatcher SO bytes
* not provided, CANN stream create / register failures).
*/
int ensure_device_initialized();

/**
* Stage a per-callable_id orchestration SO into device memory and remember
* the supporting metadata (entry/config symbol names, kernel func_id ↔
Expand Down Expand Up @@ -598,20 +621,6 @@ class DeviceRunner {
// PMU profiling (per-task AICore hardware counters)
PmuCollector pmu_collector_;

/**
* Ensure device is initialized (lazy initialization)
*
* Checks if device is already initialized. If not, performs:
* - Attach the current thread to the device
* - Create AICPU and AICore streams
* - Load AICPU SO to device memory
* - Initialize device args
*
* Reads the bound device id and executor binaries from runner state.
* @return 0 on success, error code on failure
*/
int ensure_device_initialized();

/**
* Query the maximum block_dim the stream can host.
*
Expand Down
18 changes: 16 additions & 2 deletions src/a5/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,15 +325,29 @@ int simpler_init(
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec));
// Dispatcher SO bytes — see a2a3 sibling for rationale. Empty buffer
// is permitted at simpler_init time; ensure_binaries_loaded surfaces
// the error if/when the bootstrap is actually attempted.
// is permitted at this point; the ensure_device_initialized call
// below fails fast with a clear message if bootstrap is required but
// the dispatcher path was not provided.
if (dispatcher_binary != NULL && dispatcher_size > 0) {
std::vector<uint8_t> dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size);
runner->set_dispatcher_binary(std::move(dispatcher_vec));
}
} catch (...) {
return -1;
}

// Eagerly run the one-shot device setup: create persistent AICPU/AICore
// streams, upload the dispatcher + inner SO bundle, and resolve the per-
// symbol rtFuncHandle for per-task launch — so the first prepare_callable
// / run_prepared does not pay any of these costs. Streams live until
// finalize_device; the cached rtFuncHandle on LoadAicpuOp and the
// preinstall file both live until ~DeviceRunner.
try {
rc = runner->ensure_device_initialized();
} catch (...) {
return -1;
}
if (rc != 0) return rc;
return 0;
}
/* ===========================================================================
Expand Down
Loading