From 832de9399b1e29a2de2d1967c9a57e06b8bbcd7c Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Thu, 21 May 2026 10:04:21 +0800 Subject: [PATCH 1/2] Feat: AICPU launch via dispatcher upload + Mode B per-task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-phase architecture for loading AICPU kernels on CANN 9.0+ without tar.gz / sudo pre-deployment. Bootstrap (per-DeviceRunner, idempotent across instances in a process) ====================================================================== Host bundles dispatcher SO bytes + runtime SO bytes into a single rtAicpuKernelLaunchExWithArgs (kernel_type = KERNEL_TYPE_AICPU_KFC) targeting CANN's preinstalled libaicpu_extend_kernels.so. libaicpu_extend_kernels dlopens our dispatcher and invokes its Init; the dispatcher reads the runtime SO bytes from extended DeviceArgs (inner_so_bin/inner_so_len at offsets 120/128, which libaicpu_extend_kernels ignores) and writes them to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fingerprint>.so using sched-thread (HwHiAiUser) write permission. The dispatcher SO itself is never written to the preinstall directory — it is only dlopened transiently by libaicpu_extend_kernels. The runtime SO basename embeds an FNV-1a content fingerprint. Writes go via atomic tmp+rename inside the dispatcher — no truncation window visible to concurrent aicpu_scheduler readers. A process-level fingerprint cache in LoadAicpuOp skips redundant libaicpu_extend_kernels invocations within a single host process — each runtime is bootstrapped at most once per process. Per-task launches (Mode B, no dispatcher hop) ============================================= LoadAicpuOp.Init() JSON-registers the runtime SO via rtsBinaryLoadFromFile (cpuKernelMode=0, kernelSo points at the preinstall basename), then resolves simpler_aicpu_init and simpler_aicpu_exec to rtFuncHandles via rtsFuncGetByName. 
JSON is per-process (/tmp/simpler_inner__.json) so concurrent multi-chip / multi-worker tests don't race on a shared file. opType is suffixed with the runtime SO's fingerprint so multiple LoadAicpuOp instances in the same process register non-colliding entries even though the underlying symbol names are identical. Per-task launches call rtsLaunchCpuKernel on the cached rtFuncHandles — no per-call string marshalling, no global op registry lookups, no dispatcher hop. Cleanup ======= - Removes BUILD_WITH_NEW_CANN CMake option and all ifdef branches. Mode B requires CANN 7.0+, which all supported targets ship. - Deletes the legacy AicpuLoader stub (src/{a2a3,a5}/platform/onboard/host/aicpu_loader.{cpp,h}). - Widens the aicpu_op_timeout regression test to accept the Mode B-surfaced error codes in addition to the original 507046. Reference: PR #537. --- .gitignore | 4 + simpler_setup/build_runtimes.py | 8 +- simpler_setup/runtime_builder.py | 1 + simpler_setup/runtime_compiler.py | 33 +- .../platform/onboard/aicpu/CMakeLists.txt | 45 +++ src/a2a3/platform/onboard/aicpu/kernel.cpp | 43 +-- src/a2a3/platform/onboard/host/CMakeLists.txt | 26 +- .../platform/onboard/host/device_runner.cpp | 92 +++-- .../platform/onboard/host/device_runner.h | 4 + .../aicpu/aicpu_executor.cpp | 2 +- src/a5/platform/onboard/aicpu/CMakeLists.txt | 33 ++ src/a5/platform/onboard/aicpu/kernel.cpp | 41 +-- src/a5/platform/onboard/host/CMakeLists.txt | 22 +- .../platform/onboard/host/device_runner.cpp | 85 +++-- src/a5/platform/onboard/host/device_runner.h | 4 + src/common/aicpu_dispatcher/CMakeLists.txt | 47 +++ src/common/aicpu_dispatcher/README.md | 32 ++ .../aicpu_dispatcher/aicpu_dispatcher.cpp | 195 ++++++++++ .../aicpu_dispatcher/aicpu_dispatcher.h | 66 ++++ src/common/host/CMakeLists.txt | 20 + src/common/host/load_aicpu_op.cpp | 348 ++++++++++++++++++ src/common/host/load_aicpu_op.h | 140 +++++++ .../test_aicore_op_timeout.py | 16 +- 23 files changed, 1176 insertions(+), 131 deletions(-) 
create mode 100644 src/common/aicpu_dispatcher/CMakeLists.txt create mode 100644 src/common/aicpu_dispatcher/README.md create mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.cpp create mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.h create mode 100644 src/common/host/CMakeLists.txt create mode 100644 src/common/host/load_aicpu_op.cpp create mode 100644 src/common/host/load_aicpu_op.h diff --git a/.gitignore b/.gitignore index 6502a2795..19f23ea16 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,7 @@ compile_commands.json python/_task_interface*.so python/_task_interface*.dylib .claude/scheduled_tasks.lock + +# Log files +*.log +profiling_logs_*/ diff --git a/simpler_setup/build_runtimes.py b/simpler_setup/build_runtimes.py index 9ed4fbb8c..fbe24d95e 100644 --- a/simpler_setup/build_runtimes.py +++ b/simpler_setup/build_runtimes.py @@ -131,7 +131,7 @@ def build_all( raise for platform in platforms: - arch, variant = parse_platform(platform) + arch, _ = parse_platform(platform) runtimes = discover_runtimes(arch) if not runtimes: @@ -152,6 +152,12 @@ def build_all( logger.error(f" Failed to build {platform}/{runtime_name}: {e}") raise + # No device-side deployment step here. The dispatcher SO is uploaded + # into the main aicpu_scheduler at runtime, on the first + # DeviceRunner::ensure_binaries_loaded call, via + # LoadAicpuOp::BootstrapDispatcher (see src/common/host/load_aicpu_op.cpp + # and src/common/aicpu_dispatcher/aicpu_dispatcher.h for architecture). 
+ def main(): parser = argparse.ArgumentParser(description="Pre-build runtime binaries for available platforms") diff --git a/simpler_setup/runtime_builder.py b/simpler_setup/runtime_builder.py index 28d8d7fe8..02b9323cf 100644 --- a/simpler_setup/runtime_builder.py +++ b/simpler_setup/runtime_builder.py @@ -247,6 +247,7 @@ def _compile_target(target: str) -> Path: source_dirs, build_dir=str(cache_dir), output_dir=output_dir, + runtime_name=name, ) logger.info("Compiling AICore, AICPU, Host in parallel...") diff --git a/simpler_setup/runtime_compiler.py b/simpler_setup/runtime_compiler.py index 3185984f0..a14d343a7 100644 --- a/simpler_setup/runtime_compiler.py +++ b/simpler_setup/runtime_compiler.py @@ -40,14 +40,27 @@ def get_root_dir(self) -> str: def get_binary_name(self) -> str: return self._binary_name - def gen_cmake_args(self, include_dirs: list[str], source_dirs: list[str]) -> list[str]: - """Generate CMake arguments list from toolchain args + custom directories.""" + def gen_cmake_args( + self, + include_dirs: list[str], + source_dirs: list[str], + runtime_name: Optional[str] = None, + ) -> list[str]: + """Generate CMake arguments list from toolchain args + custom directories. + + ``runtime_name`` is propagated to CMake as ``-DRUNTIME_NAME=`` so + per-runtime build outputs (e.g. the AICPU dispatcher SO) can pick a + per-runtime basename — needed for ChipWorker to bind multiple runtimes + in a single process without colliding on dispatcher state. 
+ """ inc = ";".join(os.path.abspath(d) for d in include_dirs) src = ";".join(os.path.abspath(d) for d in source_dirs) args = self.toolchain.get_cmake_args() + [ f"-DCUSTOM_INCLUDE_DIRS={inc}", f"-DCUSTOM_SOURCE_DIRS={src}", ] + if runtime_name is not None: + args.append(f"-DRUNTIME_NAME={runtime_name}") if logger.isEnabledFor(logging.DEBUG): args.append("--log-level=VERBOSE") return args @@ -201,6 +214,7 @@ def compile( source_dirs: list[str], build_dir: Optional[str] = None, output_dir: Optional[Union[str, Path]] = None, + runtime_name: Optional[str] = None, ) -> Union[bytes, Path]: """ Compile binary for the specified target platform. @@ -231,7 +245,7 @@ def compile( else: raise ValueError(f"Invalid target platform: {target_platform}. Must be 'aicore', 'aicpu', or 'host'.") - cmake_args = target.gen_cmake_args(include_dirs, source_dirs) + cmake_args = target.gen_cmake_args(include_dirs, source_dirs, runtime_name=runtime_name) cmake_source_dir = target.get_root_dir() binary_name = target.get_binary_name() platform = target_platform.upper() @@ -249,6 +263,19 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: od.mkdir(parents=True, exist_ok=True) dest = od / binary_name shutil.copy2(binary_path, dest) + # The AICPU dispatcher SO has a stable, runtime-invariant name. + # Host BootstrapDispatcher uploads it into the main aicpu_scheduler + # at process startup (no tar.gz / sudo), and the dispatcher + # self-deploys into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/. + # Per-runtime AICPU kernel SOs (libaicpu_kernel.so) are uploaded + # by host at runtime via DeviceArgs.aicpu_so_bin and lazily + # loaded by the dispatcher. 
+ dispatcher_name = "libsimpler_aicpu_dispatcher.so" + dispatcher_so = Path(actual_build_dir) / dispatcher_name + if dispatcher_so.is_file(): + dest_dispatcher = od / dispatcher_name + shutil.copy2(dispatcher_so, dest_dispatcher) + subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) return dest else: with open(binary_path, "rb") as f: diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index 6edf9eb93..5f0ded665 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -20,6 +20,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -84,3 +85,47 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) + +# Build dispatcher SO (direction 1: stable single dispatcher, runtime kernel +# uploaded at runtime). The dispatcher has NO runtime-specific code; it +# receives the per-runtime AICPU kernel SO bytes via DeviceArgs.aicpu_so_bin +# at Null phase, writes them to disk, dlopens, and dlsyms the inner +# DynTileFwkBackendKernelServer{,Init} symbols. Cache key is +# (aicpu_so_bin device address, aicpu_so_len) — different ChipWorker +# instances in the same process get separate cache entries, enabling +# single-process multi-runtime without firstCreatSo_-style locks. +# +# Output name is fixed ("simpler_aicpu_dispatcher"). 
Host bootstrap uploads +# this SO into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/ at process +# startup via LoadAicpuOp::BootstrapDispatcher — no tar.gz, no sudo. +# Building per-runtime libaicpu_kernel.so stays in this same CMakeLists +# (aicpu_kernel target above). +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,--build-id" + OUTPUT_NAME "simpler_aicpu_dispatcher" +) diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 32e24a526..e2dc61a81 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -24,8 +24,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread -// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// simpler_aicpu_init (single-threaded launch); each thread +// of the multi-threaded simpler_aicpu_exec writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. 
Plain stores — // last-writer-wins is fine for wall measurement (concurrent exiting threads' // `my_end` values differ by µs, the final overwrite is within benchmark @@ -35,27 +35,18 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); -extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { - if (arg == nullptr) { - LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); - return -1; - } - - return 0; -} - /** - * AICPU kernel initialization entry point + * AICPU kernel initialization entry point. * - * This function is called once during kernel initialization by the CANN - * runtime. It initializes logging and validates kernel arguments. - * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called once by simpler_dispatcher in the Init phase. The dispatcher + * dlsym's "simpler_aicpu_init" inside this inner SO (an internal + * dispatcher↔inner protocol — independent of CANN's preinstalled + * libaicpu_extend_kernels contract, which only binds the dispatcher itself). * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -67,7 +58,7 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer // Init is launched single-threaded (block_dim=1), so the race-free spot // to capture run start and reset the wall accumulator. Subsequent - // DynTileFwkBackendKernelServer threads stamp end on their way out, via + // simpler_aicpu_exec threads stamp end on their way out, via // the device-resident 8-byte buffer addressed by device_wall_data_base. 
g_device_start_cycle = get_sys_cnt_aicpu(); if (k_args->device_wall_data_base != 0) { @@ -79,17 +70,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } /** - * AICPU kernel main execution entry point - * - * This is the main entry point for the AICPU runtime executor kernel. - * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. + * AICPU kernel main execution entry point. * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called per-thread by simpler_dispatcher in the Run phase via dlsym + * "simpler_aicpu_exec" on the inner SO. * * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -128,13 +117,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer return 0; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer (addressed via // device_wall_data_base). 
Last-writer-wins across threads — wall diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index f0f01d438..ea23b7621 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -22,6 +22,8 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -59,6 +61,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -108,15 +114,21 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime + # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ + # rtsLaunchCpuKernel API used by LoadAicpuOp). 
+ ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/asc/include ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 +# Stable dispatcher basename. The dispatcher SO is bundled with the host +# runtime and resolved next to host_runtime.so via dladdr at runtime; +# LoadAicpuOp::BootstrapDispatcher uploads it (along with the per-runtime +# AICPU kernel SO bytes) into the main aicpu_scheduler at host process +# startup via libaicpu_extend_kernels — no tar.gz, no sudo. +target_compile_definitions(host_runtime PRIVATE + SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" ) if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) @@ -156,4 +168,10 @@ if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) target_link_libraries(host_runtime PRIVATE nnopbase) endif() +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 827552f56..6de887a9f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -18,6 +18,7 @@ #include "device_runner.h" #include "host_log.h" +#include "load_aicpu_op.h" #include @@ -28,6 +29,22 @@ #include #include "acl/acl.h" +static std::string resolve_dispatcher_so_path() { + // Dispatcher SO sits next to host_runtime.so (the SO this function lives + // in). dladdr gives us host_runtime.so's path; the dispatcher basename + // SIMPLER_AICPU_BASENAME is baked in at build time. 
+ Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return SIMPLER_AICPU_BASENAME; + } + std::string path = info.dli_fname; + size_t pos = path.rfind('/'); + if (pos == std::string::npos) { + return SIMPLER_AICPU_BASENAME; + } + return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; +} + // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" #include "callable.h" @@ -465,14 +482,42 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: + // libaicpu_extend_kernels invokes our dispatcher, which writes the inner + // SO bytes to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so + // using sched-thread (HwHiAiUser) write permission. The dispatcher itself + // never lands at preinstall — only its transient libaicpu_extend_kernels + // dlopen. Per-task launches afterwards go through Mode B + // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly + // against the preinstall file. + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.BootstrapDispatcher( + dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + ); if (rc != 0) { - LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); return rc; } + LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); - // Initialize device args + // JSON-register the inner SO and resolve simpler_aicpu_init / _exec handles. + rc = load_aicpu_op_.Init(); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); + return rc; + } + LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); + + // Keep so_info_ allocation matching upstream behavior. 
The new dispatcher + // path itself doesn't need DeviceArgs.aicpu_so_bin/len, but removing them + // empirically destabilized other tests on CI (a2a3 paged_attention_unroll + // hit AICORE-side issues). Treat the field as part of the contract that + // downstream runtime code may inspect. + rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + if (rc != 0) { + LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + return rc; + } device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -761,18 +806,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { dep_gen_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); - // Launch AICPU init kernel - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); - // Launch AICPU main kernel (over-launch for affinity gate) + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -1111,6 +1154,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // load_aicpu_op_ has no per-task device-side state to release (Mode A + // type 2 launches don't keep handles). 
The dispatcher itself was a + // transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. @@ -1195,27 +1241,11 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's + // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). The + // Mode A type 2 launch in LaunchBuiltInOp embeds it in the args struct + // for the main aicpu_scheduler to dlsym. 
+ return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 393531c48..9f1a47c0f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -51,6 +51,7 @@ #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" +#include "load_aicpu_op.h" #include "runtime.h" /** @@ -569,6 +570,9 @@ class DeviceRunner { std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // AICPU op loader — handles dispatcher bootstrap and per-task launches. + host::LoadAicpuOp load_aicpu_op_; + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 54115719e..d54cfd9d9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -264,7 +264,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU + // Try multiple paths that may allow execution on AICPU. 
char so_path[256]; bool file_created = false; const char *candidate_dirs[] = { diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index 6edf9eb93..ddc8bd553 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -20,6 +20,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -84,3 +85,35 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) + +# See src/a2a3/platform/onboard/aicpu/CMakeLists.txt for design rationale. +# Direction 1: stable single dispatcher + runtime AICPU kernel uploaded at runtime. 
+set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$:-std=gnu++17> +) + +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,--build-id" + OUTPUT_NAME "simpler_aicpu_dispatcher" +) diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 4337e4429..4dc606eea 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -23,8 +23,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread -// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// simpler_aicpu_init (single-threaded launch); each thread +// of the multi-threaded simpler_aicpu_exec writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — // last-writer-wins is fine for wall measurement. static uint64_t g_device_start_cycle = 0; @@ -32,27 +32,18 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); -extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { - if (arg == nullptr) { - LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); - return -1; - } - - return 0; -} - /** - * AICPU kernel initialization entry point + * AICPU kernel initialization entry point. * - * This function is called once during kernel initialization by the CANN - * runtime. 
It initializes logging and validates kernel arguments. - * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called once by simpler_dispatcher in the Init phase. The dispatcher + * dlsym's "simpler_aicpu_init" inside this inner SO (an internal + * dispatcher↔inner protocol — independent of CANN's preinstalled + * libaicpu_extend_kernels contract, which only binds the dispatcher itself). * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -74,17 +65,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } /** - * AICPU kernel main execution entry point - * - * This is the main entry point for the AICPU runtime executor kernel. - * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. + * AICPU kernel main execution entry point. * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called per-thread by simpler_dispatcher in the Run phase via dlsym + * "simpler_aicpu_exec" on the inner SO. 
* * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -121,13 +110,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer return 0; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer. Last-writer-wins across threads. 
uint64_t my_end = get_sys_cnt_aicpu(); diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index e5b57bf7a..7d826c34f 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -23,6 +23,8 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -44,6 +46,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -85,14 +91,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime + # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ + # rtsLaunchCpuKernel API used by LoadAicpuOp). 
+ ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 +# Stable dispatcher basename (see a2a3 CMakeLists for rationale). +target_compile_definitions(host_runtime PRIVATE + SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" ) # Link against CANN runtime libraries @@ -105,4 +113,10 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 5235394e4..b5fc44e59 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -22,6 +22,8 @@ #include +#include "load_aicpu_op.h" + #include #include #include @@ -29,6 +31,22 @@ #include #include +static std::string resolve_dispatcher_so_path() { + // Dispatcher SO sits next to host_runtime.so (the SO this function lives + // in). dladdr gives us host_runtime.so's path; the dispatcher basename + // SIMPLER_AICPU_BASENAME is baked in at build time. 
+ Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return SIMPLER_AICPU_BASENAME; + } + std::string path = info.dli_fname; + size_t pos = path.rfind('/'); + if (pos == std::string::npos) { + return SIMPLER_AICPU_BASENAME; + } + return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; +} + #include "callable.h" #include "callable_protocol.h" #include "utils/elf_build_id.h" @@ -346,14 +364,36 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: + // libaicpu_extend_kernels invokes our dispatcher, which writes the inner + // SO bytes to simpler_inner_.so in preinstall. Dispatcher itself never + // persists. Per-task launches afterwards go through Mode B + // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly + // against the preinstall file. + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.BootstrapDispatcher( + dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + ); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); + return rc; + } + LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); + + rc = load_aicpu_op_.Init(); if (rc != 0) { - LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); return rc; } + LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - // Initialize device args + // Keep so_info_ allocation matching upstream behavior (see a2a3 sibling + // for rationale). 
+ rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + if (rc != 0) { + LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + return rc; + } device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -585,16 +625,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { pmu_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -920,6 +960,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // load_aicpu_op_ has no per-task device-side state to release (Mode A + // type 2 launches don't keep handles). The dispatcher itself was a + // transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. 
@@ -1001,27 +1044,11 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's + // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). The + // Mode A type 2 launch in LaunchBuiltInOp embeds it in the args struct + // for the main aicpu_scheduler to dlsym. + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 71969f12a..306a329ce 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -50,6 +50,7 @@ #include "host/l2_perf_collector.h" #include "host/pmu_collector.h" #include "host/tensor_dump_collector.h" +#include "load_aicpu_op.h" #include "runtime.h" /** @@ -475,6 +476,9 @@ class DeviceRunner { std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // AICPU op loader — handles dispatcher bootstrap and per-task launches. 
+ host::LoadAicpuOp load_aicpu_op_; + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/common/aicpu_dispatcher/CMakeLists.txt b/src/common/aicpu_dispatcher/CMakeLists.txt new file mode 100644 index 000000000..5aa85d321 --- /dev/null +++ b/src/common/aicpu_dispatcher/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels +cmake_minimum_required(VERSION 3.16.3) + +project(aicpu_dispatcher LANGUAGES C CXX) + +# Dispatcher SO sources +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_dispatcher.cpp" +) + +# Create shared library +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# C++ standard +set_target_properties(aicpu_dispatcher PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Compile options (matching AICPU pattern). +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -fPIC + -O3 + -g +) + +# Include directories +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../.." # For common/unified_log.h +) + + +# Project-namespaced output name: libsimpler_aicpu_dispatcher.so. 
+# CMake prepends "lib" and appends ".so", so this must be the bare stem of the
+# basename the host resolves via SIMPLER_AICPU_BASENAME.
+set_target_properties(aicpu_dispatcher PROPERTIES OUTPUT_NAME "simpler_aicpu_dispatcher")
diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md
new file mode 100644
index 000000000..5963250d3
--- /dev/null
+++ b/src/common/aicpu_dispatcher/README.md
@@ -0,0 +1,32 @@
+# Simpler AICPU Dispatcher SO
+
+Source for `libsimpler_aicpu_dispatcher.so` — a transient bootstrap-only helper
+loaded by CANN's preinstalled `libaicpu_extend_kernels.so`. Its only job is to
+write the bundled runtime SO bytes to the main `aicpu_scheduler`'s preinstall
+path under a content-fingerprint filename:
+
+```text
+/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fingerprint>.so
+```
+
+The dispatcher SO itself is **never** persisted to the preinstall path and
+**never** dispatches at per-task launch time. After bootstrap, the host launches the runtime SO
+directly via `rtAicpuKernelLaunchExWithArgs` (kernel_type = `KERNEL_TYPE_AICPU`),
+which routes through the main `aicpu_scheduler` and dlopens the preinstall file.
+
+The source is runtime-agnostic, so it is built once and installed at
+`build/lib/<arch>/onboard/<runtime>/libsimpler_aicpu_dispatcher.so` (a sibling
+of each runtime's host_runtime.so). A single process binding multiple runtimes
+shares one dispatcher SO on disk.
+
+## Exported entry points
+
+Three C-style symbols are exposed; `libaicpu_extend_kernels.so::SetTileFwkKernelMap`
+dlsym's all three at load time, but only DynInit does real work:
+
+1. `StaticTileFwkBackendKernelServer` — stub
+2. `DynTileFwkBackendKernelServerInit` — bootstrap upload (real work)
+3. `DynTileFwkBackendKernelServer` — stub
+
+See `aicpu_dispatcher.h` for the bootstrap protocol details (extended DeviceArgs
+with `inner_so_bin`/`inner_so_len`, FNV-1a content fingerprint).
diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp new file mode 100644 index 000000000..333c2d83a --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher implementation — transient bootstrap-only upload helper. + * + * See aicpu_dispatcher.h for architecture. The dispatcher SO exists only + * to provide a piece of code that runs with sched-thread (HwHiAiUser) + * permissions for one purpose: write the bundled runtime SO bytes to + * the main aicpu_scheduler's preinstall path under a content-fingerprint + * filename. Once Init returns, this SO is no longer referenced — host's + * subsequent Mode B loads target the runtime SO file directly. + */ + +#include "aicpu_dispatcher.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// dlog wrapper so error paths show up in device log without depending on +// our common/unified_log machinery (this SO is loaded standalone by CANN). 
+extern "C" void DlogRecord(int moduleId, int level, const char *fmt, ...); + +namespace simpler_dispatcher { +constexpr int kDlogModuleCcecpu = 3; +constexpr int kDlogLevelError = 3; + +void DispatcherLog(const char *fmt, ...) { + char buf[1024]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (&DlogRecord != nullptr) { + DlogRecord(kDlogModuleCcecpu, kDlogLevelError, "[simpler-dispatcher] %s", buf); + } +} +} // namespace simpler_dispatcher + +// Bootstrap-time DeviceArgs view. Layout shared with host's BootstrapDispatcher. +// libaicpu_extend_kernels reads aicpu_so_bin/len/deviceId; we additionally read +// inner_so_bin/len (an extra qword pair past deviceId). +struct KernelArgs { + uint64_t unused[5] = {0}; + void *device_args{nullptr}; + void *runtime_args{nullptr}; + uint64_t regs{0}; +}; +struct DeviceArgs { + uint64_t unused[12] = {0}; + uint64_t aicpu_so_bin{0}; // 96 — dispatcher bytes (libaicpu_extend_kernels) + uint64_t aicpu_so_len{0}; // 104 + uint64_t device_id{0}; // 112 + uint64_t inner_so_bin{0}; // 120 — runtime SO bytes (dispatcher) + uint64_t inner_so_len{0}; // 128 +}; +static_assert(offsetof(KernelArgs, device_args) == 40, "KernelArgs::device_args offset drift"); +static_assert(offsetof(DeviceArgs, aicpu_so_bin) == 96, "DeviceArgs::aicpu_so_bin offset drift"); +static_assert(offsetof(DeviceArgs, aicpu_so_len) == 104, "DeviceArgs::aicpu_so_len offset drift"); +static_assert(offsetof(DeviceArgs, device_id) == 112, "DeviceArgs::device_id offset drift"); +static_assert(offsetof(DeviceArgs, inner_so_bin) == 120, "DeviceArgs::inner_so_bin offset drift"); +static_assert(offsetof(DeviceArgs, inner_so_len) == 128, "DeviceArgs::inner_so_len offset drift"); + +namespace simpler_dispatcher { + +// FNV-1a over first 64 bytes XOR'd with len. Host's MakeInnerSoBasename +// uses the same algorithm so both sides produce the same filename without +// any other channel of communication. 
+uint64_t Fingerprint(const char *data, uint64_t len) {
+    constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL;
+    constexpr uint64_t kFnvPrime = 0x100000001b3ULL;
+    // NOTE(review): only the first min(len, 64) bytes and the total length feed
+    // the hash. Two images of equal size that agree on their first 64 bytes get
+    // the same fingerprint — confirm that is acceptable for the intended inputs.
+    const size_t n = len < 64 ? static_cast<size_t>(len) : 64;
+    uint64_t h = kFnvOffset;
+    for (size_t i = 0; i < n; ++i) {
+        // Hash the raw byte value (0..255). Going through uint8_t keeps the
+        // result independent of whether plain `char` is signed on this target.
+        h ^= static_cast<uint8_t>(data[i]);
+        h *= kFnvPrime;
+    }
+    return h ^ len;
+}
+
+// Preinstall path — HwHiAiUser owns this dir, the sched thread can write here.
+// device-side /tmp is mounted read-only / restricted in CANN 9.0.
+//
+// Returns the absolute target path for fingerprint `fp`, rendered as 16
+// zero-padded lowercase hex digits.
+std::string MakeInnerSoPath(uint64_t fp) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_%016lx.so", fp);
+    return buf;
+}
+
+// Atomic write: write to a per-process temp path, then rename onto the target.
+// Several CI workers may bootstrap on different devices simultaneously and all
+// land at the same fingerprinted target path; without atomic rename a reader
+// (a sibling aicpu_scheduler's dlopen during its Mode B load) can observe a
+// truncated/partially-written file and fail with 507018 or 507046.
+//
+// Same fingerprint is expected to mean same content (subject to the coverage
+// caveat noted in Fingerprint), so whichever rename wins yields identical
+// bytes; existing dlopen handles in any aicpu_scheduler stay bound to their
+// captured inode and are unaffected by later renames. We don't fast-path on
+// the file already existing — a stale corrupt file from a pre-fix run could
+// match the fingerprint by chance, and the atomic rename overwrites cheaply.
+// Writes `len` bytes from `data` to `<path>.tmp.<pid>`, marks the file 0755
+// (best effort) and renames it onto `path`. Returns true only when the rename
+// succeeded; on any failure the temp file is unlinked and the reason logged.
+bool WriteBytes(const std::string &path, const char *data, uint64_t len) {
+    char tmp_path[320];
+    snprintf(tmp_path, sizeof(tmp_path), "%s.tmp.%d", path.c_str(), static_cast<int>(getpid()));
+    {
+        std::ofstream f(tmp_path, std::ios::binary | std::ios::trunc);
+        if (!f.is_open()) {
+            DispatcherLog("open %s for write failed: %s", tmp_path, strerror(errno));
+            return false;
+        }
+        f.write(data, static_cast<std::streamsize>(len));
+        // ofstream buffers: the bytes may only reach the file during close(),
+        // so a short write can first surface there. Inspect the stream state
+        // *after* closing, otherwise a truncated temp file could be renamed
+        // into place and later dlopen'd.
+        f.close();
+        if (f.fail()) {
+            DispatcherLog("write %s failed", tmp_path);
+            unlink(tmp_path);
+            return false;
+        }
+    }
+    // Best effort: a failed chmod leaves the mode chosen by the process umask.
+    (void)chmod(tmp_path, 0755);
+    if (rename(tmp_path, path.c_str()) != 0) {
+        DispatcherLog("rename %s -> %s failed: %s", tmp_path, path.c_str(), strerror(errno));
+        unlink(tmp_path);
+        return false;
+    }
+    return true;
+}
+
+}  // namespace simpler_dispatcher
+
+// =============================================================================
+// C-style exported entry points dlsym'd by libaicpu_extend_kernels.
+// =============================================================================
+
+extern "C" {
+
+// Stubs — libaicpu_extend_kernels::SetTileFwkKernelMap dlsym's all three at
+// load time; absence makes the whole SO unmappable. We only reach Init.
+// Both stubs log and return 1 (failure) if they are ever invoked.
+__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args) {
+    (void)args;
+    simpler_dispatcher::DispatcherLog("Static: stub (should not be called)");
+    return 1;
+}
+
+__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args) {
+    (void)args;
+    simpler_dispatcher::DispatcherLog("Server: stub (dispatcher is upload-only, should not be called)");
+    return 1;
+}
+
+// Init: write the bundled runtime SO bytes to a fingerprint-named file under
+// the main scheduler's preinstall path, return. Once this returns, host's
+// Mode B JSON load can resolve the runtime SO directly — this dispatcher SO
+// never gets referenced again.
+__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args) { + if (args == nullptr) { + simpler_dispatcher::DispatcherLog("Init: args==nullptr"); + return 1; + } + auto *k = reinterpret_cast(args); + auto *d = reinterpret_cast(k->device_args); + if (d == nullptr) { + simpler_dispatcher::DispatcherLog("Init: device_args==nullptr"); + return 1; + } + if (d->inner_so_bin == 0 || d->inner_so_len == 0) { + simpler_dispatcher::DispatcherLog( + "Init: empty inner SO bundle (bin=%lx len=%lu)", d->inner_so_bin, d->inner_so_len + ); + return 1; + } + const char *inner_bytes = reinterpret_cast(d->inner_so_bin); + uint64_t fp = simpler_dispatcher::Fingerprint(inner_bytes, d->inner_so_len); + std::string path = simpler_dispatcher::MakeInnerSoPath(fp); + if (!simpler_dispatcher::WriteBytes(path, inner_bytes, d->inner_so_len)) { + return 1; + } + simpler_dispatcher::DispatcherLog("Init: wrote %s (%lu bytes)", path.c_str(), d->inner_so_len); + return 0; +} + +} // extern "C" diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h new file mode 100644 index 000000000..29e89106c --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * AICPU Dispatcher — transient bootstrap-only upload helper.
+ *
+ * Architecture
+ * ============
+ *
+ * This dispatcher SO has one job: write the bundled runtime SO bytes to the
+ * main aicpu_scheduler's preinstall path. It is **never** installed at that
+ * preinstall path itself (libaicpu_extend_kernels only keeps a transient
+ * private copy, see step 2) and **never** dispatches at per-task launch time.
+ *
+ * Bootstrap flow (host → libaicpu_extend_kernels → dispatcher → preinstall):
+ *
+ * 1. host calls `rtAicpuKernelLaunchExWithArgs` (kernel_type =
+ *    `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels with
+ *    DeviceArgs containing:
+ *      - aicpu_so_bin / aicpu_so_len → dispatcher SO bytes (libaicpu_extend_kernels reads)
+ *      - inner_so_bin / inner_so_len → runtime SO bytes (dispatcher reads)
+ * 2. libaicpu_extend_kernels writes the dispatcher bytes to its own private
+ *    path (some /tmp on device, often unlinked after open), dlopens us,
+ *    dlsym's the three CANN-contract symbols (Static + DynInit + Dyn),
+ *    invokes our `DynTileFwkBackendKernelServerInit`.
+ * 3. Our Init reads inner_so_bin/inner_so_len from DeviceArgs, fingerprints
+ *    the bytes (FNV-1a over first 64 bytes XOR len), and writes them to
+ *    `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fingerprint>.so`.
+ *    The sched thread (HwHiAiUser) owns this dir, so the write succeeds.
+ * 4. host computes the same fingerprint locally to derive the same
+ *    preinstall filename.
+ * 5. Per-task launches: host calls `rtAicpuKernelLaunchExWithArgs`
+ *    (kernel_type = `KERNEL_TYPE_AICPU`, so_name = `simpler_inner_<fingerprint>.so`,
+ *    kernel_name = `simpler_aicpu_init`/`_exec`). The main aicpu_scheduler
+ *    dlopens the preinstall file once and caches the handle; dispatcher is
+ *    no longer in the picture.
+ *
+ * Multi-runtime in one host process: each DeviceRunner bootstraps with the
+ * same dispatcher bytes + its own runtime SO bytes.
A process-level + * fingerprint cache in LoadAicpuOp short-circuits repeat invocations for + * the same runtime SO content, so libaicpu_extend_kernels' one-shot + * `firstCreatSo_` latch fires at most once per (process, fingerprint). + */ + +#ifndef COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ +#define COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ + +#include + +// C-style exports required by libaicpu_extend_kernels' SetTileFwkKernelMap +// dlsym contract. Only DynInit does real work; the other two are stubs that +// log + return failure if ever invoked (they shouldn't be — dispatcher is +// upload-only and host's per-task launches target the runtime SO directly). +extern "C" { +__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args); +__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args); +__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args); +} + +#endif // COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ diff --git a/src/common/host/CMakeLists.txt b/src/common/host/CMakeLists.txt new file mode 100644 index 000000000..9e9125274 --- /dev/null +++ b/src/common/host/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +# Build host-side AICPU operation loader +cmake_minimum_required(VERSION 3.16.3) + +project(host_common LANGUAGES C CXX) + +# Host common sources +set(HOST_COMMON_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/load_aicpu_op.cpp" +) + +# This library is included directly in host_runtime, not built separately +# Sources are added to HOST_RUNTIME_SOURCES in platform CMakeLists.txt diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp new file mode 100644 index 000000000..d82585b4d --- /dev/null +++ b/src/common/host/load_aicpu_op.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Operation Loader Implementation + */ + +#include "load_aicpu_op.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "common/unified_log.h" +#include "runtime/rt.h" + +namespace host { + +namespace { + +std::string MakeInnerSoBasename(uint64_t fp) { + char buf[64]; + snprintf(buf, sizeof(buf), "simpler_inner_%016lx.so", fp); + return buf; +} + +// Per-runtime unique opType — different LoadAicpuOp instances in the same +// process may register the same plain symbol names (simpler_aicpu_init / _exec); +// suffixing with the runtime SO fingerprint keeps CANN's global op registry +// from collapsing distinct registrations. +std::string MakeUniqueOpType(const char *base, uint64_t fp) { + char buf[128]; + snprintf(buf, sizeof(buf), "%s_%016lx", base, fp); + return buf; +} + +uint64_t FingerprintBytes(const void *data, size_t len) { + constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL; + constexpr uint64_t kFnvPrime = 0x100000001b3ULL; + uint64_t h = kFnvOffset; + size_t n = len < 64 ? 
len : 64; + auto *p = reinterpret_cast(data); + for (size_t i = 0; i < n; ++i) { + h ^= p[i]; + h *= kFnvPrime; + } + return h ^ static_cast(len); +} + +bool ReadFileBytes(const std::string &path, std::vector &out) { + std::ifstream in(path, std::ios::binary | std::ios::ate); + if (!in.is_open()) { + LOG_ERROR("ReadFileBytes: cannot open %s: %s", path.c_str(), strerror(errno)); + return false; + } + std::streamsize len = in.tellg(); + in.seekg(0); + out.resize(static_cast(len)); + if (!in.read(out.data(), len)) { + LOG_ERROR("ReadFileBytes: read failed for %s", path.c_str()); + return false; + } + return true; +} + +struct DeviceBuf { + void *ptr = nullptr; + ~DeviceBuf() { + if (ptr != nullptr) (void)aclrtFree(ptr); + } + aclError alloc(size_t bytes) { return aclrtMalloc(&ptr, bytes, ACL_MEM_MALLOC_HUGE_FIRST); } +}; + +// Process-level cache of inner-SO fingerprints we've already bootstrapped. +// Multiple DeviceRunner instances in the same process share one entry per +// runtime here; same-content uploads short-circuit. No mutex — host-side +// LoadAicpuOp construction is always serialized by the caller (Python GIL or +// sequential per-ChipWorker init), so concurrent insert never happens. 
+// NOTE(review): the cache is keyed by fingerprint only, not by device — confirm
+// that one upload per process is sufficient when several devices are in use.
+std::unordered_set<uint64_t> &BootstrappedFps() {
+    static std::unordered_set<uint64_t> kSet;
+    return kSet;
+}
+
+}  // namespace
+
+// Uploads the dispatcher SO (read from `dispatcher_so_path`) together with the
+// runtime SO bytes in one KERNEL_TYPE_AICPU_KFC launch on `stream`, then waits
+// for the stream to drain.
+//
+// On success (including the already-bootstrapped short-circuit) inner_fp_ and
+// inner_so_basename_ describe the runtime SO and 0 is returned. On any failure
+// both members are left cleared, so a subsequent Init() is rejected instead of
+// registering an SO that was never uploaded. Failure codes are -1 for local
+// errors or the ACL / runtime error code of the failing call.
+int LoadAicpuOp::BootstrapDispatcher(
+    const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream
+) {
+    // Start from "not bootstrapped"; the members are published only on success.
+    inner_fp_ = 0;
+    inner_so_basename_.clear();
+
+    if (inner_so_data == nullptr || inner_so_len == 0) {
+        LOG_ERROR("BootstrapDispatcher: empty inner SO bytes");
+        return -1;
+    }
+    const uint64_t fp = FingerprintBytes(inner_so_data, inner_so_len);
+    const std::string basename = MakeInnerSoBasename(fp);
+
+    if (BootstrappedFps().count(fp) > 0) {
+        LOG_INFO_V2("BootstrapDispatcher: inner SO fp=%016lx already bootstrapped, skipping", fp);
+        inner_fp_ = fp;
+        inner_so_basename_ = basename;
+        return 0;
+    }
+
+    std::vector<char> dispatcher_bytes;
+    if (!ReadFileBytes(dispatcher_so_path, dispatcher_bytes)) return -1;
+    size_t dispatcher_len = dispatcher_bytes.size();
+    const char *inner_bytes = reinterpret_cast<const char *>(inner_so_data);
+    size_t inner_len = inner_so_len;
+
+    // Device copies of both SO images; freed automatically on every return path.
+    DeviceBuf dev_dispatcher;
+    DeviceBuf dev_inner;
+    aclError rc = dev_dispatcher.alloc(dispatcher_len);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMalloc(dispatcher) failed: %d", rc);
+        return rc;
+    }
+    rc = aclrtMemcpy(
+        dev_dispatcher.ptr, dispatcher_len, dispatcher_bytes.data(), dispatcher_len, ACL_MEMCPY_HOST_TO_DEVICE
+    );
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(dispatcher) failed: %d", rc);
+        return rc;
+    }
+    rc = dev_inner.alloc(inner_len);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMalloc(inner) failed: %d", rc);
+        return rc;
+    }
+    rc = aclrtMemcpy(dev_inner.ptr, inner_len, inner_bytes, inner_len, ACL_MEMCPY_HOST_TO_DEVICE);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(inner) failed: %d", rc);
+        return rc;
+    }
+
+    // Hand-assembled DeviceArgs image: five qwords at fixed byte offsets, the
+    // rest zero.
+    constexpr size_t kDeviceArgsBytes = 160;
+    char host_dev_args[kDeviceArgsBytes] = {};
+    auto write_qword = [&](size_t offset, uint64_t value) {
+        std::memcpy(host_dev_args + offset, &value, sizeof(value));
+    };
+    write_qword(96, reinterpret_cast<uint64_t>(dev_dispatcher.ptr));
+    write_qword(104, static_cast<uint64_t>(dispatcher_len));
+    // NOTE(review): offset 112 is always written as 0 — confirm this is correct
+    // when the current device is not device 0.
+    write_qword(112, 0);
+    write_qword(120, reinterpret_cast<uint64_t>(dev_inner.ptr));
+    write_qword(128, static_cast<uint64_t>(inner_len));
+
+    DeviceBuf dev_args;
+    rc = dev_args.alloc(kDeviceArgsBytes);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMalloc(device_args) failed: %d", rc);
+        return rc;
+    }
+    rc = aclrtMemcpy(dev_args.ptr, kDeviceArgsBytes, host_dev_args, kDeviceArgsBytes, ACL_MEMCPY_HOST_TO_DEVICE);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(device_args) failed: %d", rc);
+        return rc;
+    }
+
+    // Launch payload: a KernelArgs-shaped prefix (only the device_args pointer
+    // is populated) followed by the name strings located via the two offsets
+    // stored in rt_args.
+    struct Args {
+        struct {
+            uint64_t unused[5] = {0};
+            uint64_t device_args_ptr = 0;
+            uint64_t pad[20] = {0};
+        } k_args;
+        char kernel_name[32];
+        char so_name[32];
+        char op_name[32];
+    } args = {};
+    args.k_args.device_args_ptr = reinterpret_cast<uint64_t>(dev_args.ptr);
+    // `args` is zero-initialised and at most size-1 bytes are copied, so both
+    // strings stay NUL-terminated.
+    std::strncpy(args.kernel_name, "DynTileFwkKernelServerInit", sizeof(args.kernel_name) - 1);
+    std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1);
+    args.op_name[0] = '\0';
+
+    rtAicpuArgsEx_t rt_args = {};
+    rt_args.args = &args;
+    rt_args.argsSize = sizeof(args);
+    rt_args.kernelNameAddrOffset = offsetof(Args, kernel_name);
+    rt_args.soNameAddrOffset = offsetof(Args, so_name);
+
+    rtError_t rrc = rtAicpuKernelLaunchExWithArgs(
+        rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", 1, &rt_args, nullptr, stream, 0
+    );
+    if (rrc != RT_ERROR_NONE) {
+        LOG_ERROR("BootstrapDispatcher: rtAicpuKernelLaunchExWithArgs failed: %d", rrc);
+        return rrc;
+    }
+    // Wait here: the device buffers above are released when this function
+    // returns, so the launch must have finished with them first.
+    rc = aclrtSynchronizeStream(stream);
+    if (rc != ACL_SUCCESS) {
+        LOG_ERROR("BootstrapDispatcher: aclrtSynchronizeStream failed: %d", rc);
+        return rc;
+    }
+    LOG_INFO_V0(
+        "BootstrapDispatcher: bundled dispatcher (%zu B) + inner SO (%zu B) uploaded; inner SO at %s", dispatcher_len,
+        inner_len, basename.c_str()
+    );
+    inner_fp_ = fp;
+    inner_so_basename_ = basename;
+    BootstrappedFps().insert(fp);
+    return 0;
+}
+
+// Releases everything Init() and BootstrapDispatcher() set up on this object:
+// unloads the registered binary (a failure is only logged), forgets the cached
+// function handles and fingerprint, and deletes the per-process JSON file.
+// Safe to call repeatedly.
+void LoadAicpuOp::Finalize() {
+    if (binary_handle_ != nullptr) {
+        rtError_t rc = rtsBinaryUnload(binary_handle_);
+        if (rc != RT_ERROR_NONE) {
+            LOG_WARN("rtsBinaryUnload failed: %d", rc);
+        }
+        binary_handle_ = nullptr;
+    }
+    func_handles_.clear();
+    inner_fp_ = 0;
+    inner_so_basename_.clear();
+    if (!json_file_path_.empty()) {
+        std::remove(json_file_path_.c_str());
+        json_file_path_.clear();
+    }
+}
+
+LoadAicpuOp::~LoadAicpuOp() { Finalize(); }
+
+// Writes the op-description JSON consumed by rtsBinaryLoadFromFile to
+// `json_path`: one entry per exported kernel symbol (InitName, RunName), keyed
+// by the fingerprint-suffixed opType and pointing at `kernel_so`.
+// Returns false (after logging) when the file cannot be opened or the write
+// does not complete; a partially written file is removed.
+bool LoadAicpuOp::GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so) {
+    std::ofstream json_file(json_path);
+    if (!json_file.is_open()) {
+        LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str());
+        return false;
+    }
+    auto make_cfg = [&](const char *symbol_name) {
+        AicpuOpConfig c;
+        c.opType = MakeUniqueOpType(symbol_name, inner_fp_);
+        c.functionName = symbol_name;
+        c.kernelSo = kernel_so;
+        c.opKernelLib = "AICPUKernel";
+        c.userDefined = "False";
+        return c;
+    };
+    std::vector<AicpuOpConfig> op_configs = {
+        make_cfg(KernelNames::InitName),
+        make_cfg(KernelNames::RunName),
+    };
+    json_file << "{\n";
+    for (size_t i = 0; i < op_configs.size(); ++i) {
+        const auto &c = op_configs[i];
+        json_file << " \"" << c.opType << "\": {\n";
+        json_file << " \"opInfo\": {\n";
+        json_file << " \"functionName\": \"" << c.functionName << "\",\n";
+        json_file << " \"kernelSo\": \"" << c.kernelSo << "\",\n";
+        json_file << " \"opKernelLib\": \"" << c.opKernelLib << "\",\n";
+        json_file << " \"computeCost\": \"" << c.computeCost << "\",\n";
+        json_file << " \"engine\": \"" << c.engine << "\",\n";
+        json_file << " \"flagAsync\": \"" << c.flagAsync << "\",\n";
+        json_file << " \"flagPartial\": \"" << c.flagPartial << "\",\n";
+        json_file << " \"userDefined\": \"" << c.userDefined << "\"\n";
+        json_file << " }\n";
+        // JSON forbids a trailing comma after the last entry.
+        json_file << " }" << (i < op_configs.size() - 1 ? "," : "") << "\n";
+    }
+    json_file << "}\n";
+    // Buffered output may only hit the file on close(); check afterwards so a
+    // truncated JSON is never handed to the loader.
+    json_file.close();
+    if (json_file.fail()) {
+        LOG_ERROR("Failed to write JSON file: %s", json_path.c_str());
+        std::remove(json_path.c_str());
+        return false;
+    }
+    return true;
+}
+
+// Registers the bootstrapped runtime SO via a per-process JSON description and
+// caches one function handle per exported kernel symbol in func_handles_.
+//
+// Requires a successful BootstrapDispatcher() first. Calling Init() again
+// replaces the previous registration instead of leaking it. Returns 0 on
+// success, -1 for local errors, or the runtime error code of the failing call;
+// on failure no registration, handles or JSON file are left behind.
+int LoadAicpuOp::Init() {
+    if (inner_fp_ == 0) {
+        LOG_ERROR("LoadAicpuOp::Init: BootstrapDispatcher must be called first");
+        return -1;
+    }
+
+    // Same teardown as Finalize(), except that the bootstrap fingerprint and
+    // basename are kept. All steps are no-ops on a fresh object.
+    auto drop_registration = [this]() {
+        if (binary_handle_ != nullptr) {
+            rtError_t urc = rtsBinaryUnload(binary_handle_);
+            if (urc != RT_ERROR_NONE) {
+                LOG_WARN("rtsBinaryUnload failed: %d", urc);
+            }
+            binary_handle_ = nullptr;
+        }
+        func_handles_.clear();
+        if (!json_file_path_.empty()) {
+            std::remove(json_file_path_.c_str());
+            json_file_path_.clear();
+        }
+    };
+    drop_registration();
+
+    // Per-process JSON path. /tmp is always writable.
+    char json_name_buf[128];
+    snprintf(
+        json_name_buf, sizeof(json_name_buf), "/tmp/simpler_inner_%016lx_%d.json", inner_fp_, static_cast<int>(getpid())
+    );
+    json_file_path_ = json_name_buf;
+
+    if (!GenerateAicpuOpJson(json_file_path_, inner_so_basename_)) {
+        json_file_path_.clear();
+        return -1;
+    }
+
+    rtLoadBinaryOption_t option = {};
+    option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE;
+    option.value.cpuKernelMode = 0;
+
+    rtLoadBinaryConfig_t load_config = {};
+    load_config.options = &option;
+    load_config.numOpt = 1;
+
+    LOG_INFO_V2("LoadAicpuOp::Init: JSON=%s inner_basename=%s", json_file_path_.c_str(), inner_so_basename_.c_str());
+
+    rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_);
+    if (rc != RT_ERROR_NONE) {
+        LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc);
+        // Do not trust the out-parameter after a failed load.
+        binary_handle_ = nullptr;
+        std::remove(json_file_path_.c_str());
+        json_file_path_.clear();
+        return rc;
+    }
+    LOG_INFO_V2("LoadAicpuOp: Loaded inner SO via JSON, handle=%p", binary_handle_);
+
+    const char *symbol_names[] = {KernelNames::InitName, KernelNames::RunName};
+    for (const char *name : symbol_names) {
+        // Handles are looked up by the fingerprint-suffixed opType written to
+        // the JSON, but cached under the plain symbol name.
+        std::string lookup_name = MakeUniqueOpType(name, inner_fp_);
+        rtFuncHandle func_handle = nullptr;
+        rc = rtsFuncGetByName(binary_handle_, lookup_name.c_str(), &func_handle);
+        if (rc != RT_ERROR_NONE) {
+            LOG_ERROR("rtsFuncGetByName failed for %s: %d", lookup_name.c_str(), rc);
+            drop_registration();
+            return rc;
+        }
+        func_handles_[name] = func_handle;
+        LOG_INFO_V2("LoadAicpuOp: resolved handle for %s (opType=%s): %p", name, lookup_name.c_str(), func_handle);
+    }
+    return 0;
+}
+
+int LoadAicpuOp::AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num) {
+ rtCpuKernelArgs_t cpu_args = {}; + cpu_args.baseArgs.args = k_args; + cpu_args.baseArgs.argsSize = sizeof(KernelArgs); + + rtKernelLaunchCfg_t kernelLaunchCfg = {nullptr, 0U}; + auto launchKernelAttr = std::make_unique(); + kernelLaunchCfg.attrs = launchKernelAttr.get(); + + rtError_t rc = + rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, &kernelLaunchCfg, &cpu_args); + if (rc != RT_ERROR_NONE) { + LOG_ERROR("rtsLaunchCpuKernel failed: %d", rc); + return rc; + } + return 0; +} + +int LoadAicpuOp::LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name) { + auto it = func_handles_.find(func_name); + if (it == func_handles_.end()) { + LOG_ERROR("Function not found: %s", func_name.c_str()); + return -1; + } + return AicpuKernelLaunch(it->second, stream, k_args, aicpu_num); +} + +} // namespace host diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h new file mode 100644 index 000000000..4427a4b45 --- /dev/null +++ b/src/common/host/load_aicpu_op.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * @file load_aicpu_op.h + * @brief Host-side AICPU operation loader. + * + * Three-phase architecture: + * + * 1. 
BootstrapDispatcher (per-DeviceRunner, idempotent across instances in + * the same process via a content-fingerprint cache): bundles dispatcher + * SO bytes + runtime SO bytes into a single Mode A KFC launch + * (`rtAicpuKernelLaunchExWithArgs`, kernel_type = + * `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels. Our + * dispatcher then writes the runtime SO to + * `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fp>.so` + * using sched-thread (HwHiAiUser) write permission. The dispatcher SO + * itself is never persisted to disk. + * + * 2. Init (per-DeviceRunner): JSON-registers the runtime SO via + * `rtsBinaryLoadFromFile` (cpuKernelMode=0, kernelSo points at the + * preinstall basename), then resolves `simpler_aicpu_init` and + * `simpler_aicpu_exec` to `rtFuncHandle`s via `rtsFuncGetByName`. JSON + * is per-process (`/tmp/simpler_inner_<fp>_<pid>.json`) so concurrent + * multi-chip / multi-worker tests don't race on a shared file. + * + * 3. LaunchBuiltInOp (per-task): `rtsLaunchCpuKernel` on the cached + * `rtFuncHandle`. No per-launch string marshalling, no global op + * registry lookups. + * + * See common/aicpu_dispatcher/aicpu_dispatcher.h for the bootstrap protocol + * details (extended DeviceArgs with inner_so_bin/inner_so_len, + * fingerprint-named preinstall files). + */ + +#ifndef COMMON_HOST_LOAD_AICPU_OP_H_ +#define COMMON_HOST_LOAD_AICPU_OP_H_ + +#include +#include +#include + +#include "common/kernel_args.h" +#include "runtime/runtime/rts/rts_kernel.h" +#include "runtime/rt.h" + +namespace host { + +/** + * @brief AICPU operation configuration for JSON descriptor generation. 
+ */ +struct AicpuOpConfig { + std::string functionName; + std::string kernelSo; + std::string opKernelLib; + std::string computeCost = "100"; + std::string engine = "DNN_VM_AICPU"; + std::string flagAsync = "False"; + std::string flagPartial = "False"; + std::string userDefined = "False"; + std::string opType; +}; + +/** + * @brief Host-side AICPU operation loader. + * + * One instance per DeviceRunner; manages bootstrap (dispatcher upload) + + * JSON registration of the runtime SO + per-task launches via the runtime + * SO's direct rtFuncHandles. + */ +class LoadAicpuOp { +public: + LoadAicpuOp() = default; + ~LoadAicpuOp(); + + LoadAicpuOp(const LoadAicpuOp &) = delete; + LoadAicpuOp &operator=(const LoadAicpuOp &) = delete; + LoadAicpuOp(LoadAicpuOp &&) = delete; + LoadAicpuOp &operator=(LoadAicpuOp &&) = delete; + + /** + * @brief One-shot bootstrap: upload runtime SO to preinstall via dispatcher. + * + * @param dispatcher_so_path Host path to libsimpler_aicpu_dispatcher.so + * @param inner_so_data Runtime SO bytes (caller-owned, must outlive call) + * @param inner_so_len Runtime SO size + * @param stream Stream on which to enqueue the bootstrap + * @return 0 on success, error code on failure + */ + int BootstrapDispatcher( + const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream + ); + + /** + * @brief JSON-register the runtime SO and resolve its Init/Exec handles. + */ + int Init(); + + /** @brief Release binary handle + function handles + temporary JSON. */ + void Finalize(); + + /** + * @brief Launch a runtime SO entry point via rtsLaunchCpuKernel. 
+ * + * @param stream RTS stream + * @param k_args Kernel arguments + * @param aicpu_num Number of AICPU threads (1 for Init, N for Exec) + * @param func_name Lookup key in func_handles_ (KernelNames::InitName/RunName) + * @return 0 on success, error code on failure + */ + int LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name); + +private: + void *binary_handle_ = nullptr; + std::unordered_map func_handles_; + std::string json_file_path_; + uint64_t inner_fp_ = 0; + std::string inner_so_basename_; + + bool GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so); + int AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num); +}; + +// Runtime SO's actual exported symbol names. Both are looked up via the +// runtime SO's own JSON registration (no dispatcher hop at runtime). +namespace KernelNames { +constexpr const char *InitName = "simpler_aicpu_init"; // single-threaded init +constexpr const char *RunName = "simpler_aicpu_exec"; // multi-threaded exec +} // namespace KernelNames + +} // namespace host + +#endif // COMMON_HOST_LOAD_AICPU_OP_H_ diff --git a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py index 5f5fd1002..5e161cfe8 100644 --- a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py +++ b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py @@ -75,11 +75,17 @@ def test_aicore_op_timeout_surfaces_as_runtime_error(st_platform, st_device_ids) config.aicpu_thread_num = 2 t0 = time.monotonic() - # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — what - # aclrtSynchronizeStreamWithTimeout returns when the AICore stream - # (carrying the STARS-killed op) doesn't drain within the host's 2 s - # budget. Observed elapsed on Ascend910 / a2a3 onboard: ~6.3 s. 
- with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507046"): + # Acceptable error codes for the STARS-killed AICore op: + # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — host's AICore stream + # sync hits the 2 s budget first (old Mode A AICPU path). + # 507018 = ACL_ERROR_RT_AICPU_EXCEPTION — Mode B AICPU stream sync + # surfaces the AICore failure as an AICPU exception when + # the orchestration kernel detects the dead AIC task. + # 507000 = ACL_ERROR_RT_INTERNAL_ERROR — same Mode B detection, + # mapped through a different code path on a5. + # Regardless of which fires, the regression we care about is that + # the timeout chain reaps the hang in single-digit seconds. + with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507(046|018|000)"): worker.run(cid, ChipStorageTaskArgs(), config) elapsed = time.monotonic() - t0 From 56ac8af038801a72056de2605bf9c3893cd8df73 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Tue, 26 May 2026 17:07:33 +0800 Subject: [PATCH 2/2] Refactor: address PR #537 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidates the post-base review fixes into one commit on top of the dispatcher-bootstrap base. Dispatcher path is explicit - ChipWorker.init resolves dispatcher_path from RuntimeBinaries and threads the bytes through the simpler_init ABI as (const uint8_t *, size_t). Previous dladdr-based sibling resolution and the SIMPLER_AICPU_BASENAME compile def are gone. Sim simpler_init accepts the params for ABI parity, ignored. Per-arch dispatcher SO staging - libsimpler_aicpu_dispatcher.so is built per-arch (a2a3, a5) and staged once at build/lib/<arch>/dispatcher/. All runtimes on the same arch share that copy — the dispatcher carries no runtime-specific code. - runtime_compiler::compile gains a dispatcher_dest kwarg; runtime_builder passes it when target == "aicpu". 
RuntimeBinaries.dispatcher_path surfaces the shared path. Bootstrap-only ownership of host bytes - ensure_binaries_loaded() releases dispatcher_so_binary_ and aicpu_so_binary_ via clear() + shrink_to_fit() after bootstrap. Steady state holds only the aicore binary (per-run rtRegisterAllKernel reads it) and the cached rtFuncHandles on LoadAicpuOp. Fingerprint: ELF Build-ID - elf_build_id_64 reads the first 8 bytes of .note.gnu.build-id with an FNV-1a-over-full-buffer fallback. Host and dispatcher use the same helper so both sides agree on the preinstall basename without any other channel. - Replaces the previous FNV-1a-over-first-64-bytes scheme, which could collide on same-toolchain runtime SOs whose ELF headers + sizes matched. BootstrappedFps() concurrency - The per-process fingerprint cache is guarded by std::mutex (check + insert each locked; bootstrap body unlocked). Keeps concurrent ChipWorker init across DeviceRunner instances correct without serializing the heavy upload itself. LoadAicpuOp::Init RAII - Failure paths use scope guards so an rtsFuncGetByName failure also unloads the partially-registered binary handle and removes the per-process JSON descriptor. Dispatcher stubs return 0 - Static / DynServer stubs return success instead of failure: the symbols are dlsym-probed by libaicpu_extend_kernels at load time but never invoked in practice. Returning failure was a regression risk if a future CANN release ever called them as a warm-up probe. aicore_op_timeout regex widened - a2a3 expected error-code set widened to 507(046|018|000). Which stream sync sees the AIC failure first is timing-dependent across host AICore vs AICPU sync, not arch-specific. Confirmed on Ascend910 — 507018 is observed. Test fix: _ChipWorker.init signature - tests/ut/py/test_chip_worker.py updated to pass the new dispatcher_path argument (empty string for the negative-path tests, matching how sim callers thread it). 
Misc cleanup - Delete dead CMakeLists (src/common/{aicpu_dispatcher,host}/CMakeLists.txt). - Drop unused runtime_name plumbing in runtime_compiler. - Remove vestigial &DlogRecord != nullptr guard. - Doc / comment alignment with the current load path. AicpuSoInfo + DeviceArgs.aicpu_so_bin/len: retained - Initially dropped as apparent dead code (no consumer reads them under this load path). a5 onboard CI regressed with 207001 AICore launch failures + 507899 stream-create failures, matching the HANDOFF warning about CI instability when these fields disappear. Kept for layout / device-state stability; investigation tracked separately. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/bindings/task_interface.cpp | 2 +- python/simpler/task_interface.py | 9 +- simpler_setup/runtime_builder.py | 50 ++++++- simpler_setup/runtime_compiler.py | 55 ++++---- .../platform/onboard/aicpu/CMakeLists.txt | 32 +++-- src/a2a3/platform/onboard/aicpu/kernel.cpp | 15 +- src/a2a3/platform/onboard/host/CMakeLists.txt | 9 -- .../platform/onboard/host/device_runner.cpp | 49 +++---- .../platform/onboard/host/device_runner.h | 76 ++++++---- .../onboard/host/pto_runtime_c_api.cpp | 12 +- .../platform/sim/host/pto_runtime_c_api.cpp | 9 +- src/a5/platform/onboard/aicpu/CMakeLists.txt | 3 + src/a5/platform/onboard/aicpu/kernel.cpp | 15 +- src/a5/platform/onboard/host/CMakeLists.txt | 5 - .../platform/onboard/host/device_runner.cpp | 44 +++--- src/a5/platform/onboard/host/device_runner.h | 76 ++++++---- .../onboard/host/pto_runtime_c_api.cpp | 9 +- .../platform/sim/host/pto_runtime_c_api.cpp | 7 +- src/common/aicpu_dispatcher/CMakeLists.txt | 47 ------- src/common/aicpu_dispatcher/README.md | 20 +-- .../aicpu_dispatcher/aicpu_dispatcher.cpp | 46 +++--- .../aicpu_dispatcher/aicpu_dispatcher.h | 11 +- src/common/host/CMakeLists.txt | 20 --- src/common/host/load_aicpu_op.cpp | 131 ++++++++++++------ src/common/host/load_aicpu_op.h | 6 +- src/common/worker/chip_worker.cpp | 16 ++- 
src/common/worker/chip_worker.h | 6 +- src/common/worker/pto_runtime_c_api.h | 2 +- .../test_aicore_op_timeout.py | 25 ++-- tests/ut/py/test_chip_worker.py | 6 +- 30 files changed, 465 insertions(+), 348 deletions(-) delete mode 100644 src/common/aicpu_dispatcher/CMakeLists.txt delete mode 100644 src/common/host/CMakeLists.txt diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 4ba073839..7aa251db2 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -708,7 +708,7 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def( "init", &ChipWorker::init, nb::arg("host_lib_path"), nb::arg("aicpu_path"), nb::arg("aicore_path"), - nb::arg("device_id") + nb::arg("dispatcher_path"), nb::arg("device_id") ) .def("finalize", &ChipWorker::finalize) .def( diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 905fa3666..9b6a2ed09 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -319,7 +319,9 @@ def init(self, device_id, bins, log_level=None, log_info_v=None): device_id: NPU device ID to attach the calling thread to. bins: A `simpler_setup.runtime_builder.RuntimeBinaries` (or any object exposing host_path / aicpu_path / aicore_path / - simpler_log_path / sim_context_path). + simpler_log_path / sim_context_path / dispatcher_path). + ``dispatcher_path`` is required for onboard platforms and + ignored on sim (set to None). log_level: Severity floor (0=DEBUG..4=NUL). Defaults to a snapshot of the simpler logger via `_log.get_current_config()`. log_info_v: INFO verbosity threshold (0..9). Same default. @@ -354,10 +356,15 @@ def init(self, device_id, bins, log_level=None, log_info_v=None): _preload_global(str(bins.sim_context_path)) # 3. host_runtime.so is dlopen'd RTLD_LOCAL inside _impl.init. 
+ # dispatcher_path is passed as an empty string on sim (where bins + # has dispatcher_path=None); the onboard simpler_init reads it + # via LoadAicpuOp::BootstrapDispatcher, sim ignores it. + dispatcher_path = getattr(bins, "dispatcher_path", None) self._impl.init( str(bins.host_path), str(bins.aicpu_path), str(bins.aicore_path), + "" if dispatcher_path is None else str(dispatcher_path), int(device_id), ) diff --git a/simpler_setup/runtime_builder.py b/simpler_setup/runtime_builder.py index 02b9323cf..8c2e403a0 100644 --- a/simpler_setup/runtime_builder.py +++ b/simpler_setup/runtime_builder.py @@ -69,13 +69,21 @@ def _invalidate_cache_if_stale(target_cache_dir: Path, current_commit: str) -> N @dataclass class RuntimeBinaries: - """Paths to the compiled runtime binaries.""" + """Paths to the compiled runtime binaries. + + ``dispatcher_path`` points at ``libsimpler_aicpu_dispatcher.so`` and is + required for onboard platforms (host bootstrap reads its bytes and ships + them to the device alongside the inner SO). Sim platforms have no + dispatcher; the field is ``None`` there. ``_lookup_binaries`` resolves + and validates the path against the build output directory. + """ host_path: Path aicpu_path: Path aicore_path: Path simpler_log_path: Path sim_context_path: Optional[Path] = None + dispatcher_path: Optional[Path] = None class RuntimeBuilder: @@ -186,12 +194,24 @@ def _lookup_binaries(self, name: str, output_dir: Path) -> RuntimeBinaries: "Run 'pip install .' or pass --build to compile it." ) + # Resolve and validate libsimpler_aicpu_dispatcher.so for onboard + # platforms. runtime_compiler stages one copy per arch into + # <lib_dir>/<arch>/dispatcher/ (shared across all runtimes); sim + # platforms have no dispatcher. + dispatcher_path = self._resolve_dispatcher_path() + if dispatcher_path is not None and not dispatcher_path.is_file(): + raise FileNotFoundError( + f"Pre-built libsimpler_aicpu_dispatcher.so not found at {dispatcher_path}.\n" + "Run 'pip install .' 
or pass --build to compile it." + ) + return RuntimeBinaries( host_path=paths["host"], aicpu_path=paths["aicpu"], aicore_path=paths["aicore"], simpler_log_path=simpler_log_path, sim_context_path=sim_context_path, + dispatcher_path=dispatcher_path, ) def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries: @@ -216,6 +236,11 @@ def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries: arch, variant = self._arch, self._variant output_dir = self._LIB_DIR / arch / variant / name + # Per-arch shared destination for libsimpler_aicpu_dispatcher.so. The + # dispatcher has no runtime-specific code, so all runtimes on a given + # arch reuse the same SO instead of carrying a copy each (~50 KB × N). + # None on sim — sim variants have no dispatcher. + dispatcher_staging_dir = self._LIB_DIR / arch / "dispatcher" if variant != "sim" else None if not build: return self._lookup_binaries(name, output_dir) @@ -247,7 +272,7 @@ def _compile_target(target: str) -> Path: source_dirs, build_dir=str(cache_dir), output_dir=output_dir, - runtime_name=name, + dispatcher_dest=dispatcher_staging_dir if target == "aicpu" else None, ) logger.info("Compiling AICore, AICPU, Host in parallel...") @@ -269,14 +294,35 @@ def _compile_target(target: str) -> Path: self._place_compile_commands(name) logger.info("Build complete!") + # runtime_compiler stages libsimpler_aicpu_dispatcher.so into the + # per-arch shared directory when target=='aicpu'. Surface it through + # RuntimeBinaries so ChipWorker.init can pass the path to + # LoadAicpuOp::BootstrapDispatcher. 
+ dispatcher_path = self._resolve_dispatcher_path() + if dispatcher_path is not None and not dispatcher_path.is_file(): + dispatcher_path = None return RuntimeBinaries( host_path=host_path, aicpu_path=aicpu_path, aicore_path=aicore_path, simpler_log_path=simpler_log_path, sim_context_path=sim_context_path, + dispatcher_path=dispatcher_path, ) + def _resolve_dispatcher_path(self) -> Optional[Path]: + """Return path to libsimpler_aicpu_dispatcher.so for onboard variants. + + Returns ``None`` for sim variants (no dispatcher needed: sim's AICPU + runs in-process). For onboard, runtime_compiler stages one shared + copy per arch under ``build/lib/<arch>/dispatcher/`` (the dispatcher + has no runtime-specific code, so all onboard runtimes on a given + arch use the same SO). Validated separately by ``_lookup_binaries``. + """ + if self._variant == "sim": + return None + return self._LIB_DIR / self._arch / "dispatcher" / "libsimpler_aicpu_dispatcher.so" + def _resolve_sim_context_path(self) -> Optional[Path]: """Return path to libcpu_sim_context.so for sim platforms, None for onboard. diff --git a/simpler_setup/runtime_compiler.py b/simpler_setup/runtime_compiler.py index a14d343a7..6e679f20d 100644 --- a/simpler_setup/runtime_compiler.py +++ b/simpler_setup/runtime_compiler.py @@ -40,27 +40,14 @@ def get_root_dir(self) -> str: def get_binary_name(self) -> str: return self._binary_name - def gen_cmake_args( - self, - include_dirs: list[str], - source_dirs: list[str], - runtime_name: Optional[str] = None, - ) -> list[str]: - """Generate CMake arguments list from toolchain args + custom directories. - - ``runtime_name`` is propagated to CMake as ``-DRUNTIME_NAME=`` so - per-runtime build outputs (e.g. the AICPU dispatcher SO) can pick a - per-runtime basename — needed for ChipWorker to bind multiple runtimes - in a single process without colliding on dispatcher state. 
- """ + def gen_cmake_args(self, include_dirs: list[str], source_dirs: list[str]) -> list[str]: + """Generate CMake arguments list from toolchain args + custom directories.""" inc = ";".join(os.path.abspath(d) for d in include_dirs) src = ";".join(os.path.abspath(d) for d in source_dirs) args = self.toolchain.get_cmake_args() + [ f"-DCUSTOM_INCLUDE_DIRS={inc}", f"-DCUSTOM_SOURCE_DIRS={src}", ] - if runtime_name is not None: - args.append(f"-DRUNTIME_NAME={runtime_name}") if logger.isEnabledFor(logging.DEBUG): args.append("--log-level=VERBOSE") return args @@ -214,7 +201,7 @@ def compile( source_dirs: list[str], build_dir: Optional[str] = None, output_dir: Optional[Union[str, Path]] = None, - runtime_name: Optional[str] = None, + dispatcher_dest: Optional[Union[str, Path]] = None, ) -> Union[bytes, Path]: """ Compile binary for the specified target platform. @@ -226,6 +213,12 @@ def compile( build_dir: The directory path for compiling. When None, use a temporal path. output_dir: Directory to copy the final binary into. When set, returns Path. When None, returns bytes (backward-compatible). + dispatcher_dest: Directory to stage libsimpler_aicpu_dispatcher.so into. + Only consumed when target_platform == 'aicpu' (the aicpu + CMakeLists builds the dispatcher target as a side product). + When None, the dispatcher SO is not exported. Used by + runtime_builder to share one dispatcher SO across all + runtimes for a given arch. Returns: If output_dir is set: Path to the compiled binary in output_dir. @@ -245,7 +238,7 @@ def compile( else: raise ValueError(f"Invalid target platform: {target_platform}. 
Must be 'aicore', 'aicpu', or 'host'.") - cmake_args = target.gen_cmake_args(include_dirs, source_dirs, runtime_name=runtime_name) + cmake_args = target.gen_cmake_args(include_dirs, source_dirs) cmake_source_dir = target.get_root_dir() binary_name = target.get_binary_name() platform = target_platform.upper() @@ -258,24 +251,26 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: platform=platform, build_dir=actual_build_dir, ) - if output_dir is not None: - od = Path(output_dir) - od.mkdir(parents=True, exist_ok=True) - dest = od / binary_name - shutil.copy2(binary_path, dest) - # The AICPU dispatcher SO has a stable, runtime-invariant name. - # Host BootstrapDispatcher uploads it into the main aicpu_scheduler - # at process startup (no tar.gz / sudo), and the dispatcher - # self-deploys into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/. - # Per-runtime AICPU kernel SOs (libaicpu_kernel.so) are uploaded - # by host at runtime via DeviceArgs.aicpu_so_bin and lazily - # loaded by the dispatcher. + # Stage the AICPU dispatcher SO into the per-arch shared directory + # provided by runtime_builder. The dispatcher has no runtime-specific + # code (same source under any RUNTIME_NAME), so one copy per arch + # serves every runtime variant — the path is later surfaced through + # RuntimeBinaries.dispatcher_path. Only fires when the aicpu cmake + # build actually produced the dispatcher SO as a side product. 
+ if target_platform == "aicpu" and dispatcher_dest is not None: dispatcher_name = "libsimpler_aicpu_dispatcher.so" dispatcher_so = Path(actual_build_dir) / dispatcher_name if dispatcher_so.is_file(): - dest_dispatcher = od / dispatcher_name + dest_dir = Path(dispatcher_dest) + dest_dir.mkdir(parents=True, exist_ok=True) + dest_dispatcher = dest_dir / dispatcher_name shutil.copy2(dispatcher_so, dest_dispatcher) subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) + if output_dir is not None: + od = Path(output_dir) + od.mkdir(parents=True, exist_ok=True) + dest = od / binary_name + shutil.copy2(binary_path, dest) return dest else: with open(binary_path, "rb") as f: diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index 5f0ded665..627001511 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -86,20 +86,20 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) -# Build dispatcher SO (direction 1: stable single dispatcher, runtime kernel -# uploaded at runtime). The dispatcher has NO runtime-specific code; it -# receives the per-runtime AICPU kernel SO bytes via DeviceArgs.aicpu_so_bin -# at Null phase, writes them to disk, dlopens, and dlsyms the inner -# DynTileFwkBackendKernelServer{,Init} symbols. Cache key is -# (aicpu_so_bin device address, aicpu_so_len) — different ChipWorker -# instances in the same process get separate cache entries, enabling -# single-process multi-runtime without firstCreatSo_-style locks. +# Build dispatcher SO — bootstrap-only upload helper. 
The dispatcher has NO +# runtime-specific code; libaicpu_extend_kernels loads it once via +# rtAicpuKernelLaunchExWithArgs(KERNEL_TYPE_AICPU_KFC), invokes +# DynTileFwkBackendKernelServerInit, which writes the bundled inner SO bytes +# (passed via the extended DeviceArgs at offsets 120/128) to +# /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fp>.so under +# a content-fingerprint basename. After bootstrap the host registers the +# preinstall file via Mode B (rtsBinaryLoadFromFile + rtsFuncGetByName) and +# launches per-task through rtsLaunchCpuKernel; the dispatcher SO itself is +# never referenced again. # -# Output name is fixed ("simpler_aicpu_dispatcher"). Host bootstrap uploads -# this SO into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/ at process -# startup via LoadAicpuOp::BootstrapDispatcher — no tar.gz, no sudo. -# Building per-runtime libaicpu_kernel.so stays in this same CMakeLists -# (aicpu_kernel target above). +# Output name is fixed ("simpler_aicpu_dispatcher"). See +# src/common/aicpu_dispatcher/{aicpu_dispatcher.h,README.md} for the +# extended DeviceArgs layout and the FNV-1a/Build-ID fingerprint protocol. set(AICPU_DISPATCHER_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" ) @@ -120,6 +120,12 @@ target_include_directories(aicpu_dispatcher PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CUSTOM_INCLUDE_DIRS} + # src/common is needed so `#include "utils/elf_build_id.h"` resolves; + # host_runtime.so already has this on its include path (see host + # CMakeLists), and the dispatcher uses the same header to fingerprint + # the inner SO bytes by their ELF Build-ID rather than a 64-byte FNV + # over the (mostly-shared) ELF header. 
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../common ${ASCEND_HOME_PATH}/include ) diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index e2dc61a81..1d9ff308e 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -38,10 +38,12 @@ extern "C" int aicpu_execute(Runtime *arg); /** * AICPU kernel initialization entry point. * - * Called once by simpler_dispatcher in the Init phase. The dispatcher - * dlsym's "simpler_aicpu_init" inside this inner SO (an internal - * dispatcher↔inner protocol — independent of CANN's preinstalled - * libaicpu_extend_kernels contract, which only binds the dispatcher itself). + * Called once per run by the main aicpu_scheduler. Host registers this SO + * via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and + * resolves this symbol via `rtsFuncGetByName`; the per-task launch goes + * through `rtsLaunchCpuKernel` on the cached `rtFuncHandle`. The bootstrap + * dispatcher only writes this SO to the preinstall path — it does not + * dlsym these symbols itself. * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error @@ -72,8 +74,9 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a /** * AICPU kernel main execution entry point. * - * Called per-thread by simpler_dispatcher in the Run phase via dlsym - * "simpler_aicpu_exec" on the inner SO. + * Called per-thread by the main aicpu_scheduler via the cached + * `rtFuncHandle` resolved during host-side Mode B init (see + * `simpler_aicpu_init` docstring for the load path). 
* * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index ea23b7621..e607e5fd7 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -122,15 +122,6 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -# Stable dispatcher basename. The dispatcher SO is bundled with the host -# runtime and resolved next to host_runtime.so via dladdr at runtime; -# LoadAicpuOp::BootstrapDispatcher uploads it (along with the per-runtime -# AICPU kernel SO bytes) into the main aicpu_scheduler at host process -# startup via libaicpu_extend_kernels — no tar.gz, no sudo. -target_compile_definitions(host_runtime PRIVATE - SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" -) - if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) target_link_directories(host_runtime PRIVATE ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/lib64) endif() diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 6de887a9f..2f73b11b9 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -29,22 +29,6 @@ #include #include "acl/acl.h" -static std::string resolve_dispatcher_so_path() { - // Dispatcher SO sits next to host_runtime.so (the SO this function lives - // in). dladdr gives us host_runtime.so's path; the dispatcher basename - // SIMPLER_AICPU_BASENAME is baked in at build time. 
- Dl_info info; - if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { - return SIMPLER_AICPU_BASENAME; - } - std::string path = info.dli_fname; - size_t pos = path.rfind('/'); - if (pos == std::string::npos) { - return SIMPLER_AICPU_BASENAME; - } - return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; -} - // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" #include "callable.h" @@ -482,6 +466,14 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } + if (dispatcher_so_binary_.empty()) { + LOG_ERROR( + "DeviceRunner: dispatcher SO bytes not provided; pass dispatcher_path through ChipWorker.init " + "(RuntimeBinaries.dispatcher_path)" + ); + return -1; + } + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: // libaicpu_extend_kernels invokes our dispatcher, which writes the inner // SO bytes to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so @@ -490,9 +482,9 @@ int DeviceRunner::ensure_binaries_loaded() { // dlopen. Per-task launches afterwards go through Mode B // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly // against the preinstall file. - std::string dispatcher_so_path = resolve_dispatcher_so_path(); int rc = load_aicpu_op_.BootstrapDispatcher( - dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + dispatcher_so_binary_.data(), dispatcher_so_binary_.size(), aicpu_so_binary_.data(), aicpu_so_binary_.size(), + stream_aicpu_ ); if (rc != 0) { LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); @@ -508,11 +500,11 @@ int DeviceRunner::ensure_binaries_loaded() { } LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - // Keep so_info_ allocation matching upstream behavior. 
The new dispatcher - // path itself doesn't need DeviceArgs.aicpu_so_bin/len, but removing them - // empirically destabilized other tests on CI (a2a3 paged_attention_unroll - // hit AICORE-side issues). Treat the field as part of the contract that - // downstream runtime code may inspect. + // H2D copy aicpu kernel SO bytes and stamp the resulting device pointer + // into device_args_.aicpu_so_bin/len. The bytes are no longer needed by + // the preinstall-based load path, but the device-side memory is still + // load-bearing on a5 onboard — dropping the allocation surfaced 207001 + // AICore launch failures + 507899 stream-create failures in CI. rc = so_info_.init(aicpu_so_binary_, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); @@ -527,6 +519,15 @@ int DeviceRunner::ensure_binaries_loaded() { return rc; } + // Release host bytes — bootstrap is done. Mode B per-task launches go + // through the cached rtFuncHandle owned by LoadAicpuOp; dispatcher SO + // bytes are never referenced again; the aicpu kernel SO's host buffer is + // also free to drop now that so_info_ already H2D'd the bytes above. + dispatcher_so_binary_.clear(); + dispatcher_so_binary_.shrink_to_fit(); + aicpu_so_binary_.clear(); + aicpu_so_binary_.shrink_to_fit(); + binaries_loaded_ = true; LOG_INFO_V0("DeviceRunner: binaries loaded"); return 0; @@ -1151,7 +1152,7 @@ int DeviceRunner::finalize() { // Cleanup kernel args (deviceArgs) kernel_args_.finalize_device_args(); - // Cleanup AICPU SO + // Cleanup AICPU SO H2D allocation so_info_.finalize(); // load_aicpu_op_ has no per-task device-side state to release (Mode A diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 9f1a47c0f..9aa5800b2 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -59,7 +59,10 @@ * * This structure contains pointers to device memory for the AICPU shared * object. 
The layout is hardcoded in libaicpu_extend_kernels.so, which expects - * specific offsets for aicpu_so_bin and aicpu_so_len fields. + * specific offsets for aicpu_so_bin and aicpu_so_len fields. The fields are + * load-bearing on a5 onboard (CI surfaced 207001 AICore launch failures and + * 507899 stream-create failures when they were dropped); treat the layout as + * part of the device-side contract even though our own kernels do not read it. */ struct DeviceArgs { uint64_t unused[12] = {0}; @@ -67,6 +70,23 @@ struct DeviceArgs { uint64_t aicpu_so_len{0}; }; +/** + * AICPU shared object information and management + * + * Manages the host→device copy of the runtime AICPU SO bytes that backs + * DeviceArgs.aicpu_so_bin / aicpu_so_len. Required on a5 onboard even though + * our own runtime AICPU SO never dereferences these fields — removing the + * H2D allocation destabilized CI (see DeviceArgs comment above). + */ +struct AicpuSoInfo { + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; + MemoryAllocator *allocator_{nullptr}; + + int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); + int finalize(); +}; + /** * Helper class for managing KernelArgs with device memory * @@ -150,34 +170,6 @@ struct KernelArgsHelper { KernelArgs *operator&() { return &args; } }; -/** - * AICPU shared object information and management - * - * This class manages loading and device memory allocation for AICPU - * shared object (.so) files. 
- */ -struct AicpuSoInfo { - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; - MemoryAllocator *allocator_{nullptr}; - - /** - * Load shared object binary data and copy to device memory - * - * @param aicpu_so_binary Binary data of the AICPU shared object - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); - - /** - * Free device memory allocated for shared object - * - * @return 0 on success, error code on failure - */ - int finalize(); -}; - /** * Device runner for kernel execution * @@ -289,6 +281,19 @@ class DeviceRunner { aicore_kernel_binary_ = std::move(aicore_kernel_binary); } + /** + * Take ownership of the dispatcher SO bytes. Called by simpler_init when + * the caller provided a dispatcher path; ensure_binaries_loaded() hands + * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. + * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail + * with a clear message — callers that drive _ChipWorker.init directly + * without a dispatcher path get a deterministic error at run() time + * rather than a confusing dladdr-derived path. + */ + void set_dispatcher_binary(std::vector dispatcher_so_binary) { + dispatcher_so_binary_ = std::move(dispatcher_so_binary); + } + /** The device id captured by simpler_init's attach_current_thread call. */ int device_id() const { return device_id_; } @@ -566,9 +571,20 @@ class DeviceRunner { int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; int worker_count_{0}; // Stored for print_handshake_results in destructor // Executor binaries — populated once via set_executors() during - // simpler_init, owned by this runner for the rest of its lifetime. + // simpler_init. aicore_kernel_binary_ stays resident (launch_aicore_kernel + // re-registers it via rtRegisterAllKernel on every run). 
aicpu_so_binary_ + // is released by ensure_binaries_loaded() after bootstrap; bootstrap is + // the only consumer and Mode B per-task launches go through the cached + // rtFuncHandle on LoadAicpuOp, not the host bytes. std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // Dispatcher SO bytes — populated once via set_dispatcher_binary() during + // simpler_init. Consumed exclusively by BootstrapDispatcher on the first + // run() and released by ensure_binaries_loaded() right after. Empty buffer + // is permitted at init time (callers that drive ChipWorker.init without a + // dispatcher path); ensure_binaries_loaded() then fails fast with a clear + // message if/when bootstrap is actually attempted. + std::vector dispatcher_so_binary_; // AICPU op loader — handles dispatcher bootstrap and per-task launches. host::LoadAicpuOp load_aicpu_op_; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index f36aa6f0d..744b7291c 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -226,7 +226,7 @@ int finalize_device(DeviceContextHandle ctx) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size + const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size ) { if (ctx == NULL) return -1; @@ -258,6 +258,16 @@ int simpler_init( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); + // Dispatcher SO bytes are passed alongside the executors. Onboard + // requires a non-empty buffer: BootstrapDispatcher reads from it on + // the first run() to upload the dispatcher + inner SO bundle through + // libaicpu_extend_kernels. 
If the caller drives _ChipWorker.init + // directly without a dispatcher path, this stays empty and any later + // run() fails fast in ensure_binaries_loaded with a clear message. + if (dispatcher_binary != NULL && dispatcher_size > 0) { + std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); + runner->set_dispatcher_binary(std::move(dispatcher_vec)); + } } catch (...) { return -1; } diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 7c1e3cb7e..3eb4a09e0 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -217,8 +217,15 @@ int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size + const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size ) { + // Sim has no AICPU dispatcher (the simulator runs AICPU in-process). Accept + // the parameters for ABI parity with the onboard implementation and ignore + // them — callers that pass dispatcher bytes get the same shape as onboard, + // and Mode B path on sim isn't taken anyway. + (void)dispatcher_binary; + (void)dispatcher_size; + if (ctx == NULL) return -1; DeviceRunner *runner = static_cast(ctx); diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index ddc8bd553..2c95f25fc 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -108,6 +108,9 @@ target_include_directories(aicpu_dispatcher PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CUSTOM_INCLUDE_DIRS} + # src/common is needed so `#include "utils/elf_build_id.h"` resolves + # (matches a2a3 sibling; same Build-ID fingerprint protocol). 
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../common ${ASCEND_HOME_PATH}/include ) diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 4dc606eea..eb87226df 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -35,10 +35,12 @@ extern "C" int aicpu_execute(Runtime *arg); /** * AICPU kernel initialization entry point. * - * Called once by simpler_dispatcher in the Init phase. The dispatcher - * dlsym's "simpler_aicpu_init" inside this inner SO (an internal - * dispatcher↔inner protocol — independent of CANN's preinstalled - * libaicpu_extend_kernels contract, which only binds the dispatcher itself). + * Called once per run by the main aicpu_scheduler. Host registers this SO + * via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and + * resolves this symbol via `rtsFuncGetByName`; the per-task launch goes + * through `rtsLaunchCpuKernel` on the cached `rtFuncHandle`. The bootstrap + * dispatcher only writes this SO to the preinstall path — it does not + * dlsym these symbols itself. * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error @@ -67,8 +69,9 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a /** * AICPU kernel main execution entry point. * - * Called per-thread by simpler_dispatcher in the Run phase via dlsym - * "simpler_aicpu_exec" on the inner SO. + * Called per-thread by the main aicpu_scheduler via the cached + * `rtFuncHandle` resolved during host-side Mode B init (see + * `simpler_aicpu_init` docstring for the load path). 
* * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index 7d826c34f..c1a006cef 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -98,11 +98,6 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -# Stable dispatcher basename (see a2a3 CMakeLists for rationale). -target_compile_definitions(host_runtime PRIVATE - SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" -) - # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner # when performance profiling is enabled diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index b5fc44e59..ece203bd8 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -31,22 +31,6 @@ #include #include -static std::string resolve_dispatcher_so_path() { - // Dispatcher SO sits next to host_runtime.so (the SO this function lives - // in). dladdr gives us host_runtime.so's path; the dispatcher basename - // SIMPLER_AICPU_BASENAME is baked in at build time. 
- Dl_info info; - if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { - return SIMPLER_AICPU_BASENAME; - } - std::string path = info.dli_fname; - size_t pos = path.rfind('/'); - if (pos == std::string::npos) { - return SIMPLER_AICPU_BASENAME; - } - return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; -} - #include "callable.h" #include "callable_protocol.h" #include "utils/elf_build_id.h" @@ -364,15 +348,23 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } + if (dispatcher_so_binary_.empty()) { + LOG_ERROR( + "DeviceRunner: dispatcher SO bytes not provided; pass dispatcher_path through ChipWorker.init " + "(RuntimeBinaries.dispatcher_path)" + ); + return -1; + } + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: // libaicpu_extend_kernels invokes our dispatcher, which writes the inner // SO bytes to simpler_inner_.so in preinstall. Dispatcher itself never // persists. Per-task launches afterwards go through Mode B // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly // against the preinstall file. - std::string dispatcher_so_path = resolve_dispatcher_so_path(); int rc = load_aicpu_op_.BootstrapDispatcher( - dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + dispatcher_so_binary_.data(), dispatcher_so_binary_.size(), aicpu_so_binary_.data(), aicpu_so_binary_.size(), + stream_aicpu_ ); if (rc != 0) { LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); @@ -387,8 +379,9 @@ int DeviceRunner::ensure_binaries_loaded() { } LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - // Keep so_info_ allocation matching upstream behavior (see a2a3 sibling - // for rationale). + // H2D copy aicpu kernel SO bytes and stamp the resulting device pointer + // into device_args_.aicpu_so_bin/len (see a2a3 sibling — load-bearing on + // a5 onboard even though our own AICPU SO doesn't read these fields). 
rc = so_info_.init(aicpu_so_binary_, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); @@ -403,6 +396,15 @@ int DeviceRunner::ensure_binaries_loaded() { return rc; } + // Release host bytes — Mode B per-task launches use the cached rtFuncHandle + // on LoadAicpuOp; dispatcher SO bytes are never referenced again; the + // aicpu kernel SO's host buffer is also free to drop now that so_info_ + // already H2D'd the bytes above. + dispatcher_so_binary_.clear(); + dispatcher_so_binary_.shrink_to_fit(); + aicpu_so_binary_.clear(); + aicpu_so_binary_.shrink_to_fit(); + binaries_loaded_ = true; LOG_INFO_V0("DeviceRunner: binaries loaded"); return 0; @@ -957,7 +959,7 @@ int DeviceRunner::finalize() { // are released by runtime_args_cleanup RAII so they also unwind on errors. kernel_args_.finalize_device_args(); - // Cleanup AICPU SO + // Cleanup AICPU SO H2D allocation so_info_.finalize(); // load_aicpu_op_ has no per-task device-side state to release (Mode A diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 306a329ce..188f36e6a 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -58,7 +58,10 @@ * * This structure contains pointers to device memory for the AICPU shared * object. The layout is hardcoded in libaicpu_extend_kernels.so, which expects - * specific offsets for aicpu_so_bin and aicpu_so_len fields. + * specific offsets for aicpu_so_bin and aicpu_so_len fields. The fields are + * load-bearing on a5 onboard (CI surfaced 207001 AICore launch failures and + * 507899 stream-create failures when they were dropped); treat the layout as + * part of the device-side contract even though our own kernels do not read it. 
*/ struct DeviceArgs { uint64_t unused[12] = {0}; @@ -66,6 +69,23 @@ struct DeviceArgs { uint64_t aicpu_so_len{0}; }; +/** + * AICPU shared object information and management + * + * Manages the host→device copy of the runtime AICPU SO bytes that backs + * DeviceArgs.aicpu_so_bin / aicpu_so_len. Required on a5 onboard even though + * our own runtime AICPU SO never dereferences these fields — removing the + * H2D allocation destabilized CI (see DeviceArgs comment above). + */ +struct AicpuSoInfo { + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; + MemoryAllocator *allocator_{nullptr}; + + int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); + int finalize(); +}; + /** * Helper class for managing KernelArgs with device memory * @@ -139,34 +159,6 @@ struct KernelArgsHelper { KernelArgs *operator&() { return &args; } }; -/** - * AICPU shared object information and management - * - * This class manages loading and device memory allocation for AICPU - * shared object (.so) files. - */ -struct AicpuSoInfo { - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; - MemoryAllocator *allocator_{nullptr}; - - /** - * Load shared object binary data and copy to device memory - * - * @param aicpu_so_binary Binary data of the AICPU shared object - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); - - /** - * Free device memory allocated for shared object - * - * @return 0 on success, error code on failure - */ - int finalize(); -}; - /** * Device runner for kernel execution * @@ -277,6 +269,19 @@ class DeviceRunner { aicore_kernel_binary_ = std::move(aicore_kernel_binary); } + /** + * Take ownership of the dispatcher SO bytes. Called by simpler_init when + * the caller provided a dispatcher path; ensure_binaries_loaded() hands + * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. 
+ * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail + * with a clear message — callers that drive _ChipWorker.init directly + * without a dispatcher path get a deterministic error at run() time + * rather than a confusing dladdr-derived path. + */ + void set_dispatcher_binary(std::vector dispatcher_so_binary) { + dispatcher_so_binary_ = std::move(dispatcher_so_binary); + } + /** The device id captured by simpler_init's attach_current_thread call. */ int device_id() const { return device_id_; } @@ -472,9 +477,20 @@ class DeviceRunner { int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; int worker_count_{0}; // Stored for print_handshake_results in destructor // Executor binaries — populated once via set_executors() during - // simpler_init, owned by this runner for the rest of its lifetime. + // simpler_init. aicore_kernel_binary_ stays resident (launch_aicore_kernel + // re-registers it via rtRegisterAllKernel on every run). aicpu_so_binary_ + // is released by ensure_binaries_loaded() after bootstrap; bootstrap is + // the only consumer and Mode B per-task launches go through the cached + // rtFuncHandle on LoadAicpuOp, not the host bytes. std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // Dispatcher SO bytes — populated once via set_dispatcher_binary() during + // simpler_init. Consumed exclusively by BootstrapDispatcher on the first + // run() and released by ensure_binaries_loaded() right after. Empty buffer + // is permitted at init time (callers that drive ChipWorker.init without a + // dispatcher path); ensure_binaries_loaded() then fails fast with a clear + // message if/when bootstrap is actually attempted. + std::vector dispatcher_so_binary_; // AICPU op loader — handles dispatcher bootstrap and per-task launches. 
host::LoadAicpuOp load_aicpu_op_; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 593c856a9..450f971cd 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -295,7 +295,7 @@ int comm_destroy(void *handle) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size + const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size ) { if (ctx == NULL) return -1; @@ -324,6 +324,13 @@ int simpler_init( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); + // Dispatcher SO bytes — see a2a3 sibling for rationale. Empty buffer + // is permitted at simpler_init time; ensure_binaries_loaded surfaces + // the error if/when the bootstrap is actually attempted. + if (dispatcher_binary != NULL && dispatcher_size > 0) { + std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); + runner->set_dispatcher_binary(std::move(dispatcher_vec)); + } } catch (...) 
{ return -1; } diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 5ae942d14..f12f256b0 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -217,8 +217,13 @@ int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size + const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size ) { + // Sim has no AICPU dispatcher (the simulator runs AICPU in-process). See + // a2a3 sim sibling for rationale; parameters accepted for ABI parity. + (void)dispatcher_binary; + (void)dispatcher_size; + if (ctx == NULL) return -1; DeviceRunner *runner = static_cast(ctx); diff --git a/src/common/aicpu_dispatcher/CMakeLists.txt b/src/common/aicpu_dispatcher/CMakeLists.txt deleted file mode 100644 index 5aa85d321..000000000 --- a/src/common/aicpu_dispatcher/CMakeLists.txt +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -# Build AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels -cmake_minimum_required(VERSION 3.16.3) - -project(aicpu_dispatcher LANGUAGES C CXX) - -# Dispatcher SO sources -set(AICPU_DISPATCHER_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_dispatcher.cpp" -) - -# Create shared library -add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) - -# C++ standard -set_target_properties(aicpu_dispatcher PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON -) - -# Compile options (matching AICPU pattern). -target_compile_options(aicpu_dispatcher - PRIVATE - -Wall - -Wextra - -fPIC - -O3 - -g -) - -# Include directories -target_include_directories(aicpu_dispatcher - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - "${CMAKE_CURRENT_SOURCE_DIR}/../../../.." # For common/unified_log.h -) - - -# Project-namespaced output name: libsimpler_aicpu_dispatcher.so. -set_target_properties(aicpu_dispatcher PROPERTIES OUTPUT_NAME "simpler_aicpu") diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md index 5963250d3..fcdc54beb 100644 --- a/src/common/aicpu_dispatcher/README.md +++ b/src/common/aicpu_dispatcher/README.md @@ -10,14 +10,18 @@ path under a content-fingerprint filename: ``` The dispatcher SO itself is **never** persisted to disk and **never** dispatches -at per-task launch time. After bootstrap, the host launches the runtime SO -directly via `rtAicpuKernelLaunchExWithArgs` (kernel_type = `KERNEL_TYPE_AICPU`), -which routes through the main `aicpu_scheduler` and dlopens the preinstall file. - -The source is runtime-agnostic, so it is built once and installed at -`build/lib//onboard//libsimpler_aicpu_dispatcher.so` (a sibling -of each runtime's host_runtime.so). A single process binding multiple runtimes -shares one dispatcher SO on disk. +at per-task launch time. 
After bootstrap, the host registers the preinstall +file via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and +resolves `simpler_aicpu_init` / `simpler_aicpu_exec` once via +`rtsFuncGetByName`; per-task launches go through `rtsLaunchCpuKernel` on the +cached `rtFuncHandle`s. The main `aicpu_scheduler` owns the dlopen of the +preinstall file; the dispatcher is out of the picture once bootstrap returns. + +The source is runtime-agnostic. It is built per-arch under +`build/lib//onboard//libsimpler_aicpu_dispatcher.so` as a +sibling of each runtime's host_runtime.so. A single process binding multiple +runtimes can share one dispatcher SO on disk; the host process-level +fingerprint cache deduplicates bootstrap calls by inner-SO Build-ID. ## Exported entry points diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp index 333c2d83a..54d5e61ea 100644 --- a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp @@ -32,6 +32,8 @@ #include #include +#include "utils/elf_build_id.h" + // dlog wrapper so error paths show up in device log without depending on // our common/unified_log machinery (this SO is loaded standalone by CANN). extern "C" void DlogRecord(int moduleId, int level, const char *fmt, ...); @@ -46,9 +48,10 @@ void DispatcherLog(const char *fmt, ...) { va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - if (&DlogRecord != nullptr) { - DlogRecord(kDlogModuleCcecpu, kDlogLevelError, "[simpler-dispatcher] %s", buf); - } + // DlogRecord is a non-weak extern: if it failed to resolve, this SO + // would not have dlopen'd in the first place, so an address-vs-nullptr + // guard here is dead code (and is folded to `true` by most compilers). 
+ DlogRecord(kDlogModuleCcecpu, kDlogLevelError, "[simpler-dispatcher] %s", buf); } } // namespace simpler_dispatcher @@ -78,19 +81,18 @@ static_assert(offsetof(DeviceArgs, inner_so_len) == 128, "DeviceArgs::inner_so_l namespace simpler_dispatcher { -// FNV-1a over first 64 bytes XOR'd with len. Host's MakeInnerSoBasename -// uses the same algorithm so both sides produce the same filename without -// any other channel of communication. +// ELF Build-ID-derived 64-bit fingerprint (linker SHA-1 truncated to 8 +// bytes by `-Wl,--build-id`). Falls back to full-buffer FNV-1a if the SO +// was somehow linked without a build-id note. Host's +// load_aicpu_op.cpp::FingerprintBytes calls the same helper, so both sides +// produce identical fingerprints with no other channel of communication. +// +// The earlier "FNV-1a over the first 64 bytes XOR len" scheme collided in +// practice on same-toolchain runtime SOs whose ELF headers + size matched +// — wrong-code risk on the multi-runtime path. Build-IDs are strong by +// linker contract: identical Build-IDs imply byte-identical SOs. uint64_t Fingerprint(const char *data, uint64_t len) { - constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL; - constexpr uint64_t kFnvPrime = 0x100000001b3ULL; - uint64_t h = kFnvOffset; - size_t n = len < 64 ? len : 64; - for (size_t i = 0; i < n; ++i) { - h ^= static_cast(data[i]); - h *= kFnvPrime; - } - return h ^ len; + return simpler::common::utils::elf_build_id_64(data, static_cast(len)); } // Preinstall path — HwHiAiUser owns this dir, the sched thread can write here. @@ -148,17 +150,21 @@ bool WriteBytes(const std::string &path, const char *data, uint64_t len) { extern "C" { // Stubs — libaicpu_extend_kernels::SetTileFwkKernelMap dlsym's all three at -// load time; absence makes the whole SO unmappable. We only reach Init. +// load time; absence makes the whole SO unmappable. 
We only reach Init in +// practice, but return 0 (success) here to mirror the happy-path return of +// the old AICPU kernel stubs we replaced. If a future CANN version begins +// invoking Static as a warm-up probe, returning failure would be a silent +// regression versus the prior behavior. __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args) { (void)args; - simpler_dispatcher::DispatcherLog("Static: stub (should not be called)"); - return 1; + simpler_dispatcher::DispatcherLog("Static: stub (not expected to be called)"); + return 0; } __attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args) { (void)args; - simpler_dispatcher::DispatcherLog("Server: stub (dispatcher is upload-only, should not be called)"); - return 1; + simpler_dispatcher::DispatcherLog("Server: stub (dispatcher is upload-only, not expected to be called)"); + return 0; } // Init: write the bundled runtime SO bytes to a fingerprint-named file under diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h index 29e89106c..72cd297ab 100644 --- a/src/common/aicpu_dispatcher/aicpu_dispatcher.h +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.h @@ -35,11 +35,12 @@ * The sched thread (HwHiAiUser) owns this dir, so the write succeeds. * 4. host computes the same fingerprint locally to derive the same * preinstall filename. - * 5. Per-task launches: host calls `rtAicpuKernelLaunchExWithArgs` - * (kernel_type = `KERNEL_TYPE_AICPU`, so_name = `simpler_inner_.so`, - * kernel_name = `simpler_aicpu_init`/`_exec`). The main aicpu_scheduler - * dlopens the preinstall file once and caches the handle; dispatcher is - * no longer in the picture. + * 5. 
Per-task launches (Mode B): host calls `rtsBinaryLoadFromFile` to + * JSON-register the preinstall file (cpuKernelMode=0), resolves + * `simpler_aicpu_init` / `simpler_aicpu_exec` via `rtsFuncGetByName`, + * then dispatches each task through `rtsLaunchCpuKernel` on the cached + * `rtFuncHandle`. The main aicpu_scheduler owns the dlopen of the + * preinstall file; this dispatcher SO is no longer in the picture. * * Multi-runtime in one host process: each DeviceRunner bootstraps with the * same dispatcher bytes + its own runtime SO bytes. A process-level diff --git a/src/common/host/CMakeLists.txt b/src/common/host/CMakeLists.txt deleted file mode 100644 index 9e9125274..000000000 --- a/src/common/host/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -# Build host-side AICPU operation loader -cmake_minimum_required(VERSION 3.16.3) - -project(host_common LANGUAGES C CXX) - -# Host common sources -set(HOST_COMMON_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/load_aicpu_op.cpp" -) - -# This library is included directly in host_runtime, not built separately -# Sources are added to HOST_RUNTIME_SOURCES in platform CMakeLists.txt diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp index d82585b4d..71bcfb922 100644 --- a/src/common/host/load_aicpu_op.cpp +++ b/src/common/host/load_aicpu_op.cpp @@ -16,18 +16,19 @@ #include -#include #include #include #include #include #include +#include #include #include #include "acl/acl.h" #include "common/unified_log.h" #include "runtime/rt.h" +#include "utils/elf_build_id.h" namespace host { @@ -49,34 +50,12 @@ std::string MakeUniqueOpType(const char *base, uint64_t fp) { return buf; } -uint64_t FingerprintBytes(const void *data, size_t len) { - constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL; - constexpr uint64_t kFnvPrime = 0x100000001b3ULL; - uint64_t h = kFnvOffset; - size_t n = len < 64 ? len : 64; - auto *p = reinterpret_cast(data); - for (size_t i = 0; i < n; ++i) { - h ^= p[i]; - h *= kFnvPrime; - } - return h ^ static_cast(len); -} - -bool ReadFileBytes(const std::string &path, std::vector &out) { - std::ifstream in(path, std::ios::binary | std::ios::ate); - if (!in.is_open()) { - LOG_ERROR("ReadFileBytes: cannot open %s: %s", path.c_str(), strerror(errno)); - return false; - } - std::streamsize len = in.tellg(); - in.seekg(0); - out.resize(static_cast(len)); - if (!in.read(out.data(), len)) { - LOG_ERROR("ReadFileBytes: read failed for %s", path.c_str()); - return false; - } - return true; -} +// ELF Build-ID-derived 64-bit fingerprint. 
Dispatcher SO uses the same +// helper on the device side, so both sides agree on the preinstall +// basename without any other channel of communication. See +// src/common/utils/elf_build_id.h for the fallback behavior when the SO +// was linked without a build-id note. +uint64_t FingerprintBytes(const void *data, size_t len) { return simpler::common::utils::elf_build_id_64(data, len); } struct DeviceBuf { void *ptr = nullptr; @@ -88,19 +67,31 @@ struct DeviceBuf { // Process-level cache of inner-SO fingerprints we've already bootstrapped. // Multiple DeviceRunner instances in the same process share one entry per -// runtime here; same-content uploads short-circuit. No mutex — host-side -// LoadAicpuOp construction is always serialized by the caller (Python GIL or -// sequential per-ChipWorker init), so concurrent insert never happens. +// runtime here; same-content uploads short-circuit. Guarded by a mutex so +// that callers releasing the Python GIL (e.g. nanobind methods marked +// `nb::call_guard<nb::gil_scoped_release>`) cannot race on the set's +// internals. The lock is uncontended on the steady-state path and only +// touched at DeviceRunner init time, so the overhead is negligible +// compared to keeping the invariant alive in a comment.
std::unordered_set &BootstrappedFps() { static std::unordered_set kSet; return kSet; } +std::mutex &BootstrappedFpsMutex() { + static std::mutex m; + return m; +} } // namespace int LoadAicpuOp::BootstrapDispatcher( - const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream + const void *dispatcher_so_data, size_t dispatcher_so_len, const void *inner_so_data, size_t inner_so_len, + rtStream_t stream ) { + if (dispatcher_so_data == nullptr || dispatcher_so_len == 0) { + LOG_ERROR("BootstrapDispatcher: empty dispatcher SO bytes"); + return -1; + } if (inner_so_data == nullptr || inner_so_len == 0) { LOG_ERROR("BootstrapDispatcher: empty inner SO bytes"); return -1; @@ -108,14 +99,22 @@ int LoadAicpuOp::BootstrapDispatcher( inner_fp_ = FingerprintBytes(inner_so_data, inner_so_len); inner_so_basename_ = MakeInnerSoBasename(inner_fp_); - if (BootstrappedFps().count(inner_fp_) > 0) { - LOG_INFO_V2("BootstrapDispatcher: inner SO fp=%016lx already bootstrapped, skipping", inner_fp_); - return 0; + { + std::lock_guard lk(BootstrappedFpsMutex()); + if (BootstrappedFps().count(inner_fp_) > 0) { + LOG_INFO_V2("BootstrapDispatcher: inner SO fp=%016lx already bootstrapped, skipping", inner_fp_); + return 0; + } } - - std::vector dispatcher_bytes; - if (!ReadFileBytes(dispatcher_so_path, dispatcher_bytes)) return -1; - size_t dispatcher_len = dispatcher_bytes.size(); + // Note: we deliberately drop the lock for the heavy bootstrap work and + // re-take it for the post-insert below. Two threads racing on the same + // fingerprint will each perform a bootstrap, which is harmless: CANN's + // libaicpu_extend_kernels has a one-shot `firstCreatSo_` latch, and the + // atomic tmp+rename in WriteBytes is idempotent across same-content + // racers. Holding the lock across the upload would serialize all + // multi-runtime ChipWorker init in the process — a real regression. 
+ + size_t dispatcher_len = dispatcher_so_len; const char *inner_bytes = reinterpret_cast(inner_so_data); size_t inner_len = inner_so_len; @@ -126,9 +125,7 @@ int LoadAicpuOp::BootstrapDispatcher( LOG_ERROR("BootstrapDispatcher: aclrtMalloc(dispatcher) failed: %d", rc); return rc; } - rc = aclrtMemcpy( - dev_dispatcher.ptr, dispatcher_len, dispatcher_bytes.data(), dispatcher_len, ACL_MEMCPY_HOST_TO_DEVICE - ); + rc = aclrtMemcpy(dev_dispatcher.ptr, dispatcher_len, dispatcher_so_data, dispatcher_len, ACL_MEMCPY_HOST_TO_DEVICE); if (rc != ACL_SUCCESS) { LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(dispatcher) failed: %d", rc); return rc; @@ -204,7 +201,10 @@ int LoadAicpuOp::BootstrapDispatcher( "BootstrapDispatcher: bundled dispatcher (%zu B) + inner SO (%zu B) uploaded; inner SO at %s", dispatcher_len, inner_len, inner_so_basename_.c_str() ); - BootstrappedFps().insert(inner_fp_); + { + std::lock_guard lk(BootstrappedFpsMutex()); + BootstrappedFps().insert(inner_fp_); + } return 0; } @@ -228,6 +228,13 @@ void LoadAicpuOp::Finalize() { LoadAicpuOp::~LoadAicpuOp() { Finalize(); } bool LoadAicpuOp::GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so) { + // Inputs are a closed set: opType / functionName are KernelNames::* + // constants suffixed with a hex fingerprint, kernelSo is also hex-only, + // and the remaining fields are hard-coded literals. No characters that + // require JSON escaping can appear, so manual string concatenation is + // safe. If you add a field whose value can be user-derived (paths, + // user-supplied identifiers, etc.), switch to a real JSON serializer + // before letting it through. std::ofstream json_file(json_path); if (!json_file.is_open()) { LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); @@ -284,6 +291,33 @@ int LoadAicpuOp::Init() { return -1; } + // RAII cleanups: any non-zero return path below unwinds via these guards. 
+ // .release() flips them off once the corresponding state becomes part of + // the LoadAicpuOp's steady-state ownership. + struct JsonGuard { + std::string &path; + bool active = true; + ~JsonGuard() { + if (active && !path.empty()) { + std::remove(path.c_str()); + path.clear(); + } + } + void release() { active = false; } + } json_guard{json_file_path_}; + + struct BinaryGuard { + void *&handle; + bool active = true; + ~BinaryGuard() { + if (active && handle != nullptr) { + (void)rtsBinaryUnload(handle); + handle = nullptr; + } + } + void release() { active = false; } + } binary_guard{binary_handle_}; + rtLoadBinaryOption_t option = {}; option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; option.value.cpuKernelMode = 0; @@ -297,8 +331,7 @@ int LoadAicpuOp::Init() { rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); if (rc != RT_ERROR_NONE) { LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); - std::remove(json_file_path_.c_str()); - json_file_path_.clear(); + // binary_handle_ stays null; json_guard removes the JSON file. return rc; } LOG_INFO_V2("LoadAicpuOp: Loaded inner SO via JSON, handle=%p", binary_handle_); @@ -310,11 +343,17 @@ int LoadAicpuOp::Init() { rc = rtsFuncGetByName(binary_handle_, lookup_name.c_str(), &func_handle); if (rc != RT_ERROR_NONE) { LOG_ERROR("rtsFuncGetByName failed for %s: %d", lookup_name.c_str(), rc); + // binary_guard unloads the partially-registered binary, json_guard + // removes the JSON file. Symmetric with the rtsBinaryLoadFromFile + // failure branch above. 
return rc; } func_handles_[name] = func_handle; LOG_INFO_V2("LoadAicpuOp: resolved handle for %s (opType=%s): %p", name, lookup_name.c_str(), func_handle); } + + binary_guard.release(); + json_guard.release(); return 0; } diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h index 4427a4b45..dd4c94fa5 100644 --- a/src/common/host/load_aicpu_op.h +++ b/src/common/host/load_aicpu_op.h @@ -88,14 +88,16 @@ class LoadAicpuOp { /** * @brief One-shot bootstrap: upload runtime SO to preinstall via dispatcher. * - * @param dispatcher_so_path Host path to libsimpler_aicpu_dispatcher.so + * @param dispatcher_so_data Dispatcher SO bytes (caller-owned, must outlive call) + * @param dispatcher_so_len Dispatcher SO size * @param inner_so_data Runtime SO bytes (caller-owned, must outlive call) * @param inner_so_len Runtime SO size * @param stream Stream on which to enqueue the bootstrap * @return 0 on success, error code on failure */ int BootstrapDispatcher( - const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream + const void *dispatcher_so_data, size_t dispatcher_so_len, const void *inner_so_data, size_t inner_so_len, + rtStream_t stream ); /** diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 7fab4c295..2ee392ab1 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -58,7 +58,8 @@ std::vector read_binary_file(const std::string &path) { ChipWorker::~ChipWorker() { finalize(); } void ChipWorker::init( - const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, int device_id + const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, + const std::string &dispatcher_path, int device_id ) { if (finalized_) { throw std::runtime_error("ChipWorker already finalized; cannot reinitialize"); @@ -155,8 +156,19 @@ void ChipWorker::init( try { std::vector aicpu_bytes 
= read_binary_file(aicpu_path); std::vector aicore_bytes = read_binary_file(aicore_path); + // dispatcher_path is empty on sim (no dispatcher) and on tests that + // exercise _ChipWorker.init directly without a RuntimeBinaries. + // simpler_init treats a null/empty buffer as "no dispatcher" — onboard + // ensure_binaries_loaded raises with a clear message if the bootstrap + // is actually attempted, sim ignores it entirely. + std::vector dispatcher_bytes; + if (!dispatcher_path.empty()) { + dispatcher_bytes = read_binary_file(dispatcher_path); + } + const uint8_t *dispatcher_ptr = dispatcher_bytes.empty() ? nullptr : dispatcher_bytes.data(); init_rc = simpler_init_fn_( - device_ctx_, device_id, aicpu_bytes.data(), aicpu_bytes.size(), aicore_bytes.data(), aicore_bytes.size() + device_ctx_, device_id, aicpu_bytes.data(), aicpu_bytes.size(), aicore_bytes.data(), aicore_bytes.size(), + dispatcher_ptr, dispatcher_bytes.size() ); } catch (...) { destroy_device_context_fn_(device_ctx_); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 2227245f1..e1632eb2a 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -42,7 +42,8 @@ class ChipWorker { /// globals. The Python `ChipWorker` wrapper does this with `ctypes.CDLL(..., /// mode=RTLD_GLOBAL)`. void init( - const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, int device_id + const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, + const std::string &dispatcher_path, int device_id ); /// Tear down everything: device resources and runtime library. @@ -138,7 +139,8 @@ class ChipWorker { // From host_runtime.so. Single platform-side init that does (a) thread // attach + device-id record, (b) executor binary takeover, (c) onboard // CANN dlog sync. Reads the current log level off HostLogger itself. 
- using SimplerInitFn = int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t); + using SimplerInitFn = + int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t); using PrepareCallableFn = int (*)(void *, int32_t, const void *); using RunPreparedFn = int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *); diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 00debb446..c4f6b7adf 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -132,7 +132,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de */ int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size + const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size ); /** diff --git a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py index 5e161cfe8..0201d490f 100644 --- a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py +++ b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py @@ -75,16 +75,21 @@ def test_aicore_op_timeout_surfaces_as_runtime_error(st_platform, st_device_ids) config.aicpu_thread_num = 2 t0 = time.monotonic() - # Acceptable error codes for the STARS-killed AICore op: - # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — host's AICore stream - # sync hits the 2 s budget first (old Mode A AICPU path). - # 507018 = ACL_ERROR_RT_AICPU_EXCEPTION — Mode B AICPU stream sync - # surfaces the AICore failure as an AICPU exception when - # the orchestration kernel detects the dead AIC task. - # 507000 = ACL_ERROR_RT_INTERNAL_ERROR — same Mode B detection, - # mapped through a different code path on a5. 
- # Regardless of which fires, the regression we care about is that - # the timeout chain reaps the hang in single-digit seconds. + # Acceptable error codes for the STARS-killed AICore op. Which one + # surfaces is timing-dependent — it's whichever stream sync sees the + # AIC failure first: + # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — AICore stream's 2 s + # sync budget fires before AICPU sync notices. + # 507018 = ACL_ERROR_RT_AICPU_EXCEPTION — AICPU stream sync surfaces + # the AICore failure as an AICPU exception when the + # orchestration kernel detects the dead AIC task first. + # 507000 = ACL_ERROR_RT_INTERNAL_ERROR — same detection on a5, + # mapped through a different code path. + # All three are valid on both a2a3 and a5: the timing race is between + # AICPU and AICore stream sync on host, not arch-specific. The + # regression we care about is that the timeout chain reaps the hang + # in single-digit seconds and surfaces *some* 507xxx code rather than + # deadlocking. with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507(046|018|000)"): worker.run(cid, ChipStorageTaskArgs(), config) elapsed = time.monotonic() - t0 diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 165dbb62e..10056347c 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -88,17 +88,17 @@ def test_init_after_finalize_raises(self): worker = _ChipWorker() worker.finalize() with pytest.raises(RuntimeError, match="finalized"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", device_id=0) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", device_id=0) def test_init_with_nonexistent_lib_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="dlopen"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", device_id=0) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", device_id=0) def 
test_init_with_negative_device_id_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="device_id"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", -1) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", -1) def test_prepare_callable_before_init_raises(self): from _task_interface import ChipCallable # noqa: PLC0415