From 32d9a8734ff88e372c493d8802fb1e9988d5f99a Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 27 May 2026 13:01:55 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"Feat:=20AICPU=20launch=20via=20dispat?= =?UTF-8?q?cher=20bootstrap=20and=20per-task=20rtsLaunchCpu=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 27479f213db68570defb89266b7c3dc13a6a51b0. --- .gitignore | 4 - python/bindings/task_interface.cpp | 2 +- python/simpler/task_interface.py | 9 +- simpler_setup/build_runtimes.py | 8 +- simpler_setup/runtime_builder.py | 49 +-- simpler_setup/runtime_compiler.py | 22 - .../platform/onboard/aicpu/CMakeLists.txt | 51 --- src/a2a3/platform/onboard/aicpu/kernel.cpp | 46 ++- src/a2a3/platform/onboard/host/CMakeLists.txt | 21 +- .../platform/onboard/host/device_runner.cpp | 95 ++--- .../platform/onboard/host/device_runner.h | 80 ++-- .../onboard/host/pto_runtime_c_api.cpp | 12 +- .../platform/sim/host/pto_runtime_c_api.cpp | 9 +- .../aicpu/aicpu_executor.cpp | 2 +- src/a5/platform/onboard/aicpu/CMakeLists.txt | 36 -- src/a5/platform/onboard/aicpu/kernel.cpp | 44 +- src/a5/platform/onboard/host/CMakeLists.txt | 21 +- .../platform/onboard/host/device_runner.cpp | 89 ++-- src/a5/platform/onboard/host/device_runner.h | 80 ++-- .../onboard/host/pto_runtime_c_api.cpp | 9 +- .../platform/sim/host/pto_runtime_c_api.cpp | 7 +- src/common/aicpu_dispatcher/README.md | 36 -- .../aicpu_dispatcher/aicpu_dispatcher.cpp | 201 --------- .../aicpu_dispatcher/aicpu_dispatcher.h | 67 --- src/common/host/load_aicpu_op.cpp | 387 ------------------ src/common/host/load_aicpu_op.h | 142 ------- src/common/worker/chip_worker.cpp | 16 +- src/common/worker/chip_worker.h | 6 +- src/common/worker/pto_runtime_c_api.h | 2 +- .../test_aicore_op_timeout.py | 21 +- tests/ut/py/test_chip_worker.py | 6 +- 31 files changed, 209 insertions(+), 1371 deletions(-) delete mode 100644 src/common/aicpu_dispatcher/README.md delete mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.cpp delete mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.h delete mode 100644 src/common/host/load_aicpu_op.cpp delete mode 100644 src/common/host/load_aicpu_op.h diff --git a/.gitignore b/.gitignore index 19f23ea16..6502a2795 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,3 @@ compile_commands.json python/_task_interface*.so python/_task_interface*.dylib .claude/scheduled_tasks.lock - -# Log files -*.log -profiling_logs_*/ diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 7aa251db2..4ba073839 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -708,7 +708,7 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def( "init", &ChipWorker::init, nb::arg("host_lib_path"), nb::arg("aicpu_path"), nb::arg("aicore_path"), - nb::arg("dispatcher_path"), nb::arg("device_id") + nb::arg("device_id") ) .def("finalize", &ChipWorker::finalize) .def( diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index d2dc8cc5b..0a06d269a 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -319,9 +319,7 @@ def init(self, device_id, bins, log_level=None, log_info_v=None): device_id: NPU device ID to attach the calling thread to. bins: A `simpler_setup.runtime_builder.RuntimeBinaries` (or any object exposing host_path / aicpu_path / aicore_path / - simpler_log_path / sim_context_path / dispatcher_path). - ``dispatcher_path`` is required for onboard platforms and - ignored on sim (set to None). + simpler_log_path / sim_context_path). log_level: Severity floor (0=DEBUG..4=NUL). Defaults to a snapshot of the simpler logger via `_log.get_current_config()`. log_info_v: INFO verbosity threshold (0..9). Same default. @@ -356,15 +354,10 @@ def init(self, device_id, bins, log_level=None, log_info_v=None): _preload_global(str(bins.sim_context_path)) # 3. host_runtime.so is dlopen'd RTLD_LOCAL inside _impl.init. - # dispatcher_path is passed as an empty string on sim (where bins - # has dispatcher_path=None); the onboard simpler_init reads it - # via LoadAicpuOp::BootstrapDispatcher, sim ignores it. - dispatcher_path = getattr(bins, "dispatcher_path", None) self._impl.init( str(bins.host_path), str(bins.aicpu_path), str(bins.aicore_path), - "" if dispatcher_path is None else str(dispatcher_path), int(device_id), ) diff --git a/simpler_setup/build_runtimes.py b/simpler_setup/build_runtimes.py index fbe24d95e..9ed4fbb8c 100644 --- a/simpler_setup/build_runtimes.py +++ b/simpler_setup/build_runtimes.py @@ -131,7 +131,7 @@ def build_all( raise for platform in platforms: - arch, _ = parse_platform(platform) + arch, variant = parse_platform(platform) runtimes = discover_runtimes(arch) if not runtimes: @@ -152,12 +152,6 @@ def build_all( logger.error(f" Failed to build {platform}/{runtime_name}: {e}") raise - # No device-side deployment step here. The dispatcher SO is uploaded - # into the main aicpu_scheduler at runtime, on the first - # DeviceRunner::ensure_binaries_loaded call, via - # LoadAicpuOp::BootstrapDispatcher (see src/common/host/load_aicpu_op.cpp - # and src/common/aicpu_dispatcher/aicpu_dispatcher.h for architecture). - def main(): parser = argparse.ArgumentParser(description="Pre-build runtime binaries for available platforms") diff --git a/simpler_setup/runtime_builder.py b/simpler_setup/runtime_builder.py index 8c2e403a0..28d8d7fe8 100644 --- a/simpler_setup/runtime_builder.py +++ b/simpler_setup/runtime_builder.py @@ -69,21 +69,13 @@ def _invalidate_cache_if_stale(target_cache_dir: Path, current_commit: str) -> N @dataclass class RuntimeBinaries: - """Paths to the compiled runtime binaries. - - ``dispatcher_path`` points at ``libsimpler_aicpu_dispatcher.so`` and is - required for onboard platforms (host bootstrap reads its bytes and ships - them to the device alongside the inner SO). Sim platforms have no - dispatcher; the field is ``None`` there. ``_lookup_binaries`` resolves - and validates the path against the build output directory. - """ + """Paths to the compiled runtime binaries.""" host_path: Path aicpu_path: Path aicore_path: Path simpler_log_path: Path sim_context_path: Optional[Path] = None - dispatcher_path: Optional[Path] = None class RuntimeBuilder: @@ -194,24 +186,12 @@ def _lookup_binaries(self, name: str, output_dir: Path) -> RuntimeBinaries: "Run 'pip install .' or pass --build to compile it." ) - # Resolve and validate libsimpler_aicpu_dispatcher.so for onboard - # platforms. runtime_compiler stages one copy per arch into - # //dispatcher/ (shared across all runtimes); sim - # platforms have no dispatcher. - dispatcher_path = self._resolve_dispatcher_path() - if dispatcher_path is not None and not dispatcher_path.is_file(): - raise FileNotFoundError( - f"Pre-built libsimpler_aicpu_dispatcher.so not found at {dispatcher_path}.\n" - "Run 'pip install .' or pass --build to compile it." - ) - return RuntimeBinaries( host_path=paths["host"], aicpu_path=paths["aicpu"], aicore_path=paths["aicore"], simpler_log_path=simpler_log_path, sim_context_path=sim_context_path, - dispatcher_path=dispatcher_path, ) def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries: @@ -236,11 +216,6 @@ def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries: arch, variant = self._arch, self._variant output_dir = self._LIB_DIR / arch / variant / name - # Per-arch shared destination for libsimpler_aicpu_dispatcher.so. The - # dispatcher has no runtime-specific code, so all runtimes on a given - # arch reuse the same SO instead of carrying a copy each (~50 KB × N). - # None on sim — sim variants have no dispatcher. - dispatcher_staging_dir = self._LIB_DIR / arch / "dispatcher" if variant != "sim" else None if not build: return self._lookup_binaries(name, output_dir) @@ -272,7 +247,6 @@ def _compile_target(target: str) -> Path: source_dirs, build_dir=str(cache_dir), output_dir=output_dir, - dispatcher_dest=dispatcher_staging_dir if target == "aicpu" else None, ) logger.info("Compiling AICore, AICPU, Host in parallel...") @@ -294,35 +268,14 @@ def _compile_target(target: str) -> Path: self._place_compile_commands(name) logger.info("Build complete!") - # runtime_compiler stages libsimpler_aicpu_dispatcher.so into the - # per-arch shared directory when target=='aicpu'. Surface it through - # RuntimeBinaries so ChipWorker.init can pass the path to - # LoadAicpuOp::BootstrapDispatcher. - dispatcher_path = self._resolve_dispatcher_path() - if dispatcher_path is not None and not dispatcher_path.is_file(): - dispatcher_path = None return RuntimeBinaries( host_path=host_path, aicpu_path=aicpu_path, aicore_path=aicore_path, simpler_log_path=simpler_log_path, sim_context_path=sim_context_path, - dispatcher_path=dispatcher_path, ) - def _resolve_dispatcher_path(self) -> Optional[Path]: - """Return path to libsimpler_aicpu_dispatcher.so for onboard variants. - - Returns ``None`` for sim variants (no dispatcher needed: sim's AICPU - runs in-process). For onboard, runtime_compiler stages one shared - copy per arch under ``build/lib//dispatcher/`` (the dispatcher - has no runtime-specific code, so all onboard runtimes on a given - arch use the same SO). Validated separately by ``_lookup_binaries``. - """ - if self._variant == "sim": - return None - return self._LIB_DIR / self._arch / "dispatcher" / "libsimpler_aicpu_dispatcher.so" - def _resolve_sim_context_path(self) -> Optional[Path]: """Return path to libcpu_sim_context.so for sim platforms, None for onboard. diff --git a/simpler_setup/runtime_compiler.py b/simpler_setup/runtime_compiler.py index 6e679f20d..3185984f0 100644 --- a/simpler_setup/runtime_compiler.py +++ b/simpler_setup/runtime_compiler.py @@ -201,7 +201,6 @@ def compile( source_dirs: list[str], build_dir: Optional[str] = None, output_dir: Optional[Union[str, Path]] = None, - dispatcher_dest: Optional[Union[str, Path]] = None, ) -> Union[bytes, Path]: """ Compile binary for the specified target platform. @@ -213,12 +212,6 @@ def compile( build_dir: The directory path for compiling. When None, use a temporal path. output_dir: Directory to copy the final binary into. When set, returns Path. When None, returns bytes (backward-compatible). - dispatcher_dest: Directory to stage libsimpler_aicpu_dispatcher.so into. - Only consumed when target_platform == 'aicpu' (the aicpu - CMakeLists builds the dispatcher target as a side product). - When None, the dispatcher SO is not exported. Used by - runtime_builder to share one dispatcher SO across all - runtimes for a given arch. Returns: If output_dir is set: Path to the compiled binary in output_dir. @@ -251,21 +244,6 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: platform=platform, build_dir=actual_build_dir, ) - # Stage the AICPU dispatcher SO into the per-arch shared directory - # provided by runtime_builder. The dispatcher has no runtime-specific - # code (same source under any RUNTIME_NAME), so one copy per arch - # serves every runtime variant — the path is later surfaced through - # RuntimeBinaries.dispatcher_path. Only fires when the aicpu cmake - # build actually produced the dispatcher SO as a side product. - if target_platform == "aicpu" and dispatcher_dest is not None: - dispatcher_name = "libsimpler_aicpu_dispatcher.so" - dispatcher_so = Path(actual_build_dir) / dispatcher_name - if dispatcher_so.is_file(): - dest_dir = Path(dispatcher_dest) - dest_dir.mkdir(parents=True, exist_ok=True) - dest_dispatcher = dest_dir / dispatcher_name - shutil.copy2(dispatcher_so, dest_dispatcher) - subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) if output_dir is not None: od = Path(output_dir) od.mkdir(parents=True, exist_ok=True) diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index 627001511..6edf9eb93 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -20,7 +20,6 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -85,53 +84,3 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) - -# Build dispatcher SO — bootstrap-only upload helper. The dispatcher has NO -# runtime-specific code; libaicpu_extend_kernels loads it once via -# rtAicpuKernelLaunchExWithArgs(KERNEL_TYPE_AICPU_KFC), invokes -# DynTileFwkBackendKernelServerInit, which writes the bundled inner SO bytes -# (passed via the extended DeviceArgs at offsets 120/128) to -# /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so under -# a content-fingerprint basename. After bootstrap the host registers the -# preinstall file via Mode B (rtsBinaryLoadFromFile + rtsFuncGetByName) and -# launches per-task through rtsLaunchCpuKernel; the dispatcher SO itself is -# never referenced again. -# -# Output name is fixed ("simpler_aicpu_dispatcher"). See -# src/common/aicpu_dispatcher/{aicpu_dispatcher.h,README.md} for the -# extended DeviceArgs layout and the FNV-1a/Build-ID fingerprint protocol. -set(AICPU_DISPATCHER_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" -) -add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) - -target_compile_options(aicpu_dispatcher - PRIVATE - -Wall - -Wextra - -rdynamic - -O3 - -fPIC - -g - $<$:-std=gnu++17> -) - -target_include_directories(aicpu_dispatcher - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CUSTOM_INCLUDE_DIRS} - # src/common is needed so `#include "utils/elf_build_id.h"` resolves; - # host_runtime.so already has this on its include path (see host - # CMakeLists), and the dispatcher uses the same header to fingerprint - # the inner SO bytes by their ELF Build-ID rather than a 64-byte FNV - # over the (mostly-shared) ELF header. - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../common - ${ASCEND_HOME_PATH}/include -) - -target_link_libraries(aicpu_dispatcher PRIVATE dl) - -set_target_properties(aicpu_dispatcher PROPERTIES - LINK_FLAGS "-Wl,--build-id" - OUTPUT_NAME "simpler_aicpu_dispatcher" -) diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 95640bd4e..8109aa1dc 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -24,8 +24,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// simpler_aicpu_init (single-threaded launch); each thread -// of the multi-threaded simpler_aicpu_exec writes the converted +// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread +// of the multi-threaded DynTileFwkBackendKernelServer writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — // last-writer-wins is fine for wall measurement (concurrent exiting threads' // `my_end` values differ by µs, the final overwrite is within benchmark @@ -35,20 +35,27 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); +extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { + if (arg == nullptr) { + LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); + return -1; + } + + return 0; +} + /** - * AICPU kernel initialization entry point. + * AICPU kernel initialization entry point * - * Called once per run by the main aicpu_scheduler. Host registers this SO - * via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and - * resolves this symbol via `rtsFuncGetByName`; the per-task launch goes - * through `rtsLaunchCpuKernel` on the cached `rtFuncHandle`. The bootstrap - * dispatcher only writes this SO to the preinstall path — it does not - * dlsym these symbols itself. + * This function is called once during kernel initialization by the CANN + * runtime. It initializes logging and validates kernel arguments. + * + * Note: Function name is hardcoded in libaicpu_extend_kernels.so * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { +extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -60,7 +67,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a // Init is launched single-threaded (block_dim=1), so the race-free spot // to capture run start and reset the wall accumulator. Subsequent - // simpler_aicpu_exec threads stamp end on their way out, via + // DynTileFwkBackendKernelServer threads stamp end on their way out, via // the device-resident 8-byte buffer addressed by device_wall_data_base. g_device_start_cycle = get_sys_cnt_aicpu(); if (k_args->device_wall_data_base != 0) { @@ -72,16 +79,17 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a } /** - * AICPU kernel main execution entry point. + * AICPU kernel main execution entry point + * + * This is the main entry point for the AICPU runtime executor kernel. + * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. * - * Called per-thread by the main aicpu_scheduler via the cached - * `rtFuncHandle` resolved during host-side Mode B init (see - * `simpler_aicpu_init` docstring for the load path). + * Note: Function name is hardcoded in libaicpu_extend_kernels.so * * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { +extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -120,13 +128,13 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a return 0; } - LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer (addressed via // device_wall_data_base). Last-writer-wins across threads — wall diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index e607e5fd7..f0f01d438 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -22,8 +22,6 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -61,10 +59,6 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp" ) -# Add common/host sources (LoadAicpuOp) -list(APPEND HOST_RUNTIME_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" -) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -114,14 +108,17 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime - # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ - # rtsLaunchCpuKernel API used by LoadAicpuOp). - ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/asc/include ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) target_link_directories(host_runtime PRIVATE ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/lib64) endif() @@ -159,10 +156,4 @@ if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) target_link_libraries(host_runtime PRIVATE nnopbase) endif() -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) - set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index be027d244..4f04a94ce 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -18,7 +18,6 @@ #include "device_runner.h" #include "host_log.h" -#include "load_aicpu_op.h" #include @@ -466,50 +465,14 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - if (dispatcher_so_binary_.empty()) { - LOG_ERROR( - "DeviceRunner: dispatcher SO bytes not provided; pass dispatcher_path through ChipWorker.init " - "(RuntimeBinaries.dispatcher_path)" - ); - return -1; - } - - // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: - // libaicpu_extend_kernels invokes our dispatcher, which writes the inner - // SO bytes to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so - // using sched-thread (HwHiAiUser) write permission. The dispatcher itself - // never lands at preinstall — only its transient libaicpu_extend_kernels - // dlopen. Per-task launches afterwards go through Mode B - // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly - // against the preinstall file. - int rc = load_aicpu_op_.BootstrapDispatcher( - dispatcher_so_binary_.data(), dispatcher_so_binary_.size(), aicpu_so_binary_.data(), aicpu_so_binary_.size(), - stream_aicpu_ - ); - if (rc != 0) { - LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); - return rc; - } - LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); - - // JSON-register the inner SO and resolve simpler_aicpu_init / _exec handles. - rc = load_aicpu_op_.Init(); - if (rc != 0) { - LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); - return rc; - } - LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - - // H2D copy aicpu kernel SO bytes and stamp the resulting device pointer - // into device_args_.aicpu_so_bin/len. The bytes are no longer needed by - // the preinstall-based load path, but the device-side memory is still - // load-bearing on a5 onboard — dropping the allocation surfaced 207001 - // AICore launch failures + 507899 stream-create failures in CI. - rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Load AICPU SO + int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; } + + // Initialize device args device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -519,15 +482,6 @@ int DeviceRunner::ensure_binaries_loaded() { return rc; } - // Release host bytes — bootstrap is done. Mode B per-task launches go - // through the cached rtFuncHandle owned by LoadAicpuOp; dispatcher SO - // bytes are never referenced again; the aicpu kernel SO's host buffer is - // also free to drop now that so_info_ already H2D'd the bytes above. - dispatcher_so_binary_.clear(); - dispatcher_so_binary_.shrink_to_fit(); - aicpu_so_binary_.clear(); - aicpu_so_binary_.shrink_to_fit(); - binaries_loaded_ = true; LOG_INFO_V0("DeviceRunner: binaries loaded"); return 0; @@ -828,16 +782,18 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { dep_gen_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); + LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); + // Launch AICPU init kernel + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); + LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); + // Launch AICPU main kernel (over-launch for affinity gate) rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -1173,12 +1129,9 @@ int DeviceRunner::finalize() { // Cleanup kernel args (deviceArgs) kernel_args_.finalize_device_args(); - // Cleanup AICPU SO H2D allocation + // Cleanup AICPU SO so_info_.finalize(); - // load_aicpu_op_ has no per-task device-side state to release (Mode A - // type 2 launches don't keep handles). The dispatcher itself was a - // transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. @@ -1263,11 +1216,27 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's - // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). The - // Mode A type 2 launch in LaunchBuiltInOp embeds it in the args struct - // for the main aicpu_scheduler to dlsym. - return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index f8e1b7acf..bd9d088b0 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -51,7 +51,6 @@ #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" -#include "load_aicpu_op.h" #include "runtime.h" /** @@ -59,10 +58,7 @@ * * This structure contains pointers to device memory for the AICPU shared * object. The layout is hardcoded in libaicpu_extend_kernels.so, which expects - * specific offsets for aicpu_so_bin and aicpu_so_len fields. The fields are - * load-bearing on a5 onboard (CI surfaced 207001 AICore launch failures and - * 507899 stream-create failures when they were dropped); treat the layout as - * part of the device-side contract even though our own kernels do not read it. + * specific offsets for aicpu_so_bin and aicpu_so_len fields. */ struct DeviceArgs { uint64_t unused[12] = {0}; @@ -70,23 +66,6 @@ struct DeviceArgs { uint64_t aicpu_so_len{0}; }; -/** - * AICPU shared object information and management - * - * Manages the host→device copy of the runtime AICPU SO bytes that backs - * DeviceArgs.aicpu_so_bin / aicpu_so_len. Required on a5 onboard even though - * our own runtime AICPU SO never dereferences these fields — removing the - * H2D allocation destabilized CI (see DeviceArgs comment above). - */ -struct AicpuSoInfo { - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; - MemoryAllocator *allocator_{nullptr}; - - int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); - int finalize(); -}; - /** * Helper class for managing KernelArgs with device memory * @@ -170,6 +149,34 @@ struct KernelArgsHelper { KernelArgs *operator&() { return &args; } }; +/** + * AICPU shared object information and management + * + * This class manages loading and device memory allocation for AICPU + * shared object (.so) files. + */ +struct AicpuSoInfo { + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; + MemoryAllocator *allocator_{nullptr}; + + /** + * Load shared object binary data and copy to device memory + * + * @param aicpu_so_binary Binary data of the AICPU shared object + * @param allocator Memory allocator to use + * @return 0 on success, error code on failure + */ + int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); + + /** + * Free device memory allocated for shared object + * + * @return 0 on success, error code on failure + */ + int finalize(); +}; + /** * Device runner for kernel execution * @@ -281,19 +288,6 @@ class DeviceRunner { aicore_kernel_binary_ = std::move(aicore_kernel_binary); } - /** - * Take ownership of the dispatcher SO bytes. Called by simpler_init when - * the caller provided a dispatcher path; ensure_binaries_loaded() hands - * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. - * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail - * with a clear message — callers that drive _ChipWorker.init directly - * without a dispatcher path get a deterministic error at run() time - * rather than a confusing dladdr-derived path. - */ - void set_dispatcher_binary(std::vector dispatcher_so_binary) { - dispatcher_so_binary_ = std::move(dispatcher_so_binary); - } - /** The device id captured by simpler_init's attach_current_thread call. */ int device_id() const { return device_id_; } @@ -571,23 +565,9 @@ class DeviceRunner { int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; int worker_count_{0}; // Stored for print_handshake_results in destructor // Executor binaries — populated once via set_executors() during - // simpler_init. aicore_kernel_binary_ stays resident (launch_aicore_kernel - // re-registers it via rtRegisterAllKernel on every run). aicpu_so_binary_ - // is released by ensure_binaries_loaded() after bootstrap; bootstrap is - // the only consumer and Mode B per-task launches go through the cached - // rtFuncHandle on LoadAicpuOp, not the host bytes. + // simpler_init, owned by this runner for the rest of its lifetime. std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; - // Dispatcher SO bytes — populated once via set_dispatcher_binary() during - // simpler_init. Consumed exclusively by BootstrapDispatcher on the first - // run() and released by ensure_binaries_loaded() right after. Empty buffer - // is permitted at init time (callers that drive ChipWorker.init without a - // dispatcher path); ensure_binaries_loaded() then fails fast with a clear - // message if/when bootstrap is actually attempted. - std::vector dispatcher_so_binary_; - - // AICPU op loader — handles dispatcher bootstrap and per-task launches. - host::LoadAicpuOp load_aicpu_op_; // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 744b7291c..f36aa6f0d 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -226,7 +226,7 @@ int finalize_device(DeviceContextHandle ctx) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size + const uint8_t *aicore_binary, size_t aicore_size ) { if (ctx == NULL) return -1; @@ -258,16 +258,6 @@ int simpler_init( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); - // Dispatcher SO bytes are passed alongside the executors. Onboard - // requires a non-empty buffer: BootstrapDispatcher reads from it on - // the first run() to upload the dispatcher + inner SO bundle through - // libaicpu_extend_kernels. If the caller drives _ChipWorker.init - // directly without a dispatcher path, this stays empty and any later - // run() fails fast in ensure_binaries_loaded with a clear message. - if (dispatcher_binary != NULL && dispatcher_size > 0) { - std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); - runner->set_dispatcher_binary(std::move(dispatcher_vec)); - } } catch (...) { return -1; } diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 3eb4a09e0..7c1e3cb7e 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -217,15 +217,8 @@ int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size + const uint8_t *aicore_binary, size_t aicore_size ) { - // Sim has no AICPU dispatcher (the simulator runs AICPU in-process). Accept - // the parameters for ABI parity with the onboard implementation and ignore - // them — callers that pass dispatcher bytes get the same shape as onboard, - // and Mode B path on sim isn't taken anyway. - (void)dispatcher_binary; - (void)dispatcher_size; - if (ctx == NULL) return -1; DeviceRunner *runner = static_cast(ctx); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f8e35917b..9022e033b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -266,7 +266,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU. + // Try multiple paths that may allow execution on AICPU char so_path[256]; bool file_created = false; const char *candidate_dirs[] = { diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index 2c95f25fc..6edf9eb93 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -20,7 +20,6 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -85,38 +84,3 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) - -# See src/a2a3/platform/onboard/aicpu/CMakeLists.txt for design rationale. -# Direction 1: stable single dispatcher + runtime AICPU kernel uploaded at runtime. -set(AICPU_DISPATCHER_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" -) -add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) - -target_compile_options(aicpu_dispatcher - PRIVATE - -Wall - -Wextra - -rdynamic - -O3 - -fPIC - -g - $<$:-std=gnu++17> -) - -target_include_directories(aicpu_dispatcher - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CUSTOM_INCLUDE_DIRS} - # src/common is needed so `#include "utils/elf_build_id.h"` resolves - # (matches a2a3 sibling; same Build-ID fingerprint protocol). - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../common - ${ASCEND_HOME_PATH}/include -) - -target_link_libraries(aicpu_dispatcher PRIVATE dl) - -set_target_properties(aicpu_dispatcher PROPERTIES - LINK_FLAGS "-Wl,--build-id" - OUTPUT_NAME "simpler_aicpu_dispatcher" -) diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 189731073..c4011b2b1 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -23,8 +23,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// simpler_aicpu_init (single-threaded launch); each thread -// of the multi-threaded simpler_aicpu_exec writes the converted +// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread +// of the multi-threaded DynTileFwkBackendKernelServer writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — // last-writer-wins is fine for wall measurement. static uint64_t g_device_start_cycle = 0; @@ -32,20 +32,27 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); +extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { + if (arg == nullptr) { + LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); + return -1; + } + + return 0; +} + /** - * AICPU kernel initialization entry point. + * AICPU kernel initialization entry point * - * Called once per run by the main aicpu_scheduler. Host registers this SO - * via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and - * resolves this symbol via `rtsFuncGetByName`; the per-task launch goes - * through `rtsLaunchCpuKernel` on the cached `rtFuncHandle`. The bootstrap - * dispatcher only writes this SO to the preinstall path — it does not - * dlsym these symbols itself. + * This function is called once during kernel initialization by the CANN + * runtime. It initializes logging and validates kernel arguments. + * + * Note: Function name is hardcoded in libaicpu_extend_kernels.so * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { +extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -67,16 +74,17 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a } /** - * AICPU kernel main execution entry point. + * AICPU kernel main execution entry point + * + * This is the main entry point for the AICPU runtime executor kernel. + * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. * - * Called per-thread by the main aicpu_scheduler via the cached - * `rtFuncHandle` resolved during host-side Mode B init (see - * `simpler_aicpu_init` docstring for the load path). + * Note: Function name is hardcoded in libaicpu_extend_kernels.so * * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { +extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -113,13 +121,13 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a return 0; } - LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer. Last-writer-wins across threads. uint64_t my_end = get_sys_cnt_aicpu(); diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index c1a006cef..e5b57bf7a 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -23,8 +23,6 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") -list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -46,10 +44,6 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" ) -# Add common/host sources (LoadAicpuOp) -list(APPEND HOST_RUNTIME_SOURCES - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" -) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -91,13 +85,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime - # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ - # rtsLaunchCpuKernel API used by LoadAicpuOp). - ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + # Link against CANN runtime libraries # ascend_hal is dynamically loaded at runtime via dlopen in device_runner # when performance profiling is enabled @@ -108,10 +105,4 @@ target_link_libraries(host_runtime dl ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 -) - set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 71ef32f30..8017f9b09 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -22,8 +22,6 @@ #include -#include "load_aicpu_op.h" - #include #include #include @@ -348,45 +346,14 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - if (dispatcher_so_binary_.empty()) { - LOG_ERROR( - "DeviceRunner: dispatcher SO bytes not provided; pass dispatcher_path through ChipWorker.init " - "(RuntimeBinaries.dispatcher_path)" - ); - return -1; - } - - // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: - // libaicpu_extend_kernels invokes our dispatcher, which writes the inner - // SO bytes to simpler_inner_.so in preinstall. Dispatcher itself never - // persists. Per-task launches afterwards go through Mode B - // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly - // against the preinstall file. - int rc = load_aicpu_op_.BootstrapDispatcher( - dispatcher_so_binary_.data(), dispatcher_so_binary_.size(), aicpu_so_binary_.data(), aicpu_so_binary_.size(), - stream_aicpu_ - ); - if (rc != 0) { - LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); - return rc; - } - LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); - - rc = load_aicpu_op_.Init(); - if (rc != 0) { - LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); - return rc; - } - LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - - // H2D copy aicpu kernel SO bytes and stamp the resulting device pointer - // into device_args_.aicpu_so_bin/len (see a2a3 sibling — load-bearing on - // a5 onboard even though our own AICPU SO doesn't read these fields). - rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Load AICPU SO + int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); if (rc != 0) { LOG_ERROR("AicpuSoInfo::init failed: %d", rc); return rc; } + + // Initialize device args device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -396,15 +363,6 @@ int DeviceRunner::ensure_binaries_loaded() { return rc; } - // Release host bytes — Mode B per-task launches use the cached rtFuncHandle - // on LoadAicpuOp; dispatcher SO bytes are never referenced again; the - // aicpu kernel SO's host buffer is also free to drop now that so_info_ - // already H2D'd the bytes above. - dispatcher_so_binary_.clear(); - dispatcher_so_binary_.shrink_to_fit(); - aicpu_so_binary_.clear(); - aicpu_so_binary_.shrink_to_fit(); - binaries_loaded_ = true; LOG_INFO_V0("DeviceRunner: binaries loaded"); return 0; @@ -648,16 +606,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { pmu_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); + LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); + LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -980,12 +938,9 @@ int DeviceRunner::finalize() { // are released by runtime_args_cleanup RAII so they also unwind on errors. kernel_args_.finalize_device_args(); - // Cleanup AICPU SO H2D allocation + // Cleanup AICPU SO so_info_.finalize(); - // load_aicpu_op_ has no per-task device-side state to release (Mode A - // type 2 launches don't keep handles). The dispatcher itself was a - // transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. @@ -1067,11 +1022,27 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's - // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). The - // Mode A type 2 launch in LaunchBuiltInOp embeds it in the args struct - // for the main aicpu_scheduler to dlsym. - return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); + struct Args { + KernelArgs k_args; + char kernel_name[32]; + const char so_name[32] = {"libaicpu_extend_kernels.so"}; + const char op_name[32] = {""}; + } args; + + args.k_args = *k_args; + std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); + args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; + + rtAicpuArgsEx_t rt_args; + std::memset(&rt_args, 0, sizeof(rt_args)); + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(struct Args, so_name); + + return rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 + ); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index b880c67b3..cab1c2d8c 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -50,7 +50,6 @@ #include "host/l2_perf_collector.h" #include "host/pmu_collector.h" #include "host/tensor_dump_collector.h" -#include "load_aicpu_op.h" #include "runtime.h" /** @@ -58,10 +57,7 @@ * * This structure contains pointers to device memory for the AICPU shared * object. The layout is hardcoded in libaicpu_extend_kernels.so, which expects - * specific offsets for aicpu_so_bin and aicpu_so_len fields. The fields are - * load-bearing on a5 onboard (CI surfaced 207001 AICore launch failures and - * 507899 stream-create failures when they were dropped); treat the layout as - * part of the device-side contract even though our own kernels do not read it. + * specific offsets for aicpu_so_bin and aicpu_so_len fields. */ struct DeviceArgs { uint64_t unused[12] = {0}; @@ -69,23 +65,6 @@ struct DeviceArgs { uint64_t aicpu_so_len{0}; }; -/** - * AICPU shared object information and management - * - * Manages the host→device copy of the runtime AICPU SO bytes that backs - * DeviceArgs.aicpu_so_bin / aicpu_so_len. Required on a5 onboard even though - * our own runtime AICPU SO never dereferences these fields — removing the - * H2D allocation destabilized CI (see DeviceArgs comment above). - */ -struct AicpuSoInfo { - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; - MemoryAllocator *allocator_{nullptr}; - - int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); - int finalize(); -}; - /** * Helper class for managing KernelArgs with device memory * @@ -159,6 +138,34 @@ struct KernelArgsHelper { KernelArgs *operator&() { return &args; } }; +/** + * AICPU shared object information and management + * + * This class manages loading and device memory allocation for AICPU + * shared object (.so) files. + */ +struct AicpuSoInfo { + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; + MemoryAllocator *allocator_{nullptr}; + + /** + * Load shared object binary data and copy to device memory + * + * @param aicpu_so_binary Binary data of the AICPU shared object + * @param allocator Memory allocator to use + * @return 0 on success, error code on failure + */ + int init(const std::vector &aicpu_so_binary, MemoryAllocator &allocator); + + /** + * Free device memory allocated for shared object + * + * @return 0 on success, error code on failure + */ + int finalize(); +}; + /** * Device runner for kernel execution * @@ -269,19 +276,6 @@ class DeviceRunner { aicore_kernel_binary_ = std::move(aicore_kernel_binary); } - /** - * Take ownership of the dispatcher SO bytes. Called by simpler_init when - * the caller provided a dispatcher path; ensure_binaries_loaded() hands - * the buffer to LoadAicpuOp::BootstrapDispatcher on the first run. - * Leaving this unset (empty buffer) makes ensure_binaries_loaded() fail - * with a clear message — callers that drive _ChipWorker.init directly - * without a dispatcher path get a deterministic error at run() time - * rather than a confusing dladdr-derived path. - */ - void set_dispatcher_binary(std::vector dispatcher_so_binary) { - dispatcher_so_binary_ = std::move(dispatcher_so_binary); - } - /** The device id captured by simpler_init's attach_current_thread call. */ int device_id() const { return device_id_; } @@ -480,23 +474,9 @@ class DeviceRunner { int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; int worker_count_{0}; // Stored for print_handshake_results in destructor // Executor binaries — populated once via set_executors() during - // simpler_init. aicore_kernel_binary_ stays resident (launch_aicore_kernel - // re-registers it via rtRegisterAllKernel on every run). aicpu_so_binary_ - // is released by ensure_binaries_loaded() after bootstrap; bootstrap is - // the only consumer and Mode B per-task launches go through the cached - // rtFuncHandle on LoadAicpuOp, not the host bytes. + // simpler_init, owned by this runner for the rest of its lifetime. std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; - // Dispatcher SO bytes — populated once via set_dispatcher_binary() during - // simpler_init. Consumed exclusively by BootstrapDispatcher on the first - // run() and released by ensure_binaries_loaded() right after. Empty buffer - // is permitted at init time (callers that drive ChipWorker.init without a - // dispatcher path); ensure_binaries_loaded() then fails fast with a clear - // message if/when bootstrap is actually attempted. - std::vector dispatcher_so_binary_; - - // AICPU op loader — handles dispatcher bootstrap and per-task launches. - host::LoadAicpuOp load_aicpu_op_; // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 0cc17c81f..21f919fd0 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -295,7 +295,7 @@ int comm_destroy(void *handle) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size + const uint8_t *aicore_binary, size_t aicore_size ) { if (ctx == NULL) return -1; @@ -324,13 +324,6 @@ int simpler_init( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); runner->set_executors(std::move(aicpu_vec), std::move(aicore_vec)); - // Dispatcher SO bytes — see a2a3 sibling for rationale. Empty buffer - // is permitted at simpler_init time; ensure_binaries_loaded surfaces - // the error if/when the bootstrap is actually attempted. - if (dispatcher_binary != NULL && dispatcher_size > 0) { - std::vector dispatcher_vec(dispatcher_binary, dispatcher_binary + dispatcher_size); - runner->set_dispatcher_binary(std::move(dispatcher_vec)); - } } catch (...) { return -1; } diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 81e9b138f..debf09f75 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -217,13 +217,8 @@ int destroy_comm_stream_ctx(DeviceContextHandle ctx, void *stream) { int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size + const uint8_t *aicore_binary, size_t aicore_size ) { - // Sim has no AICPU dispatcher (the simulator runs AICPU in-process). See - // a2a3 sim sibling for rationale; parameters accepted for ABI parity. - (void)dispatcher_binary; - (void)dispatcher_size; - if (ctx == NULL) return -1; DeviceRunner *runner = static_cast(ctx); diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md deleted file mode 100644 index fcdc54beb..000000000 --- a/src/common/aicpu_dispatcher/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Simpler AICPU Dispatcher SO - -Source for `libsimpler_aicpu_dispatcher.so` — a transient bootstrap-only helper -loaded by CANN's preinstalled `libaicpu_extend_kernels.so`. Its only job is to -write the bundled runtime SO bytes to the main `aicpu_scheduler`'s preinstall -path under a content-fingerprint filename: - -```text -/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so -``` - -The dispatcher SO itself is **never** persisted to disk and **never** dispatches -at per-task launch time. After bootstrap, the host registers the preinstall -file via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and -resolves `simpler_aicpu_init` / `simpler_aicpu_exec` once via -`rtsFuncGetByName`; per-task launches go through `rtsLaunchCpuKernel` on the -cached `rtFuncHandle`s. The main `aicpu_scheduler` owns the dlopen of the -preinstall file; the dispatcher is out of the picture once bootstrap returns. - -The source is runtime-agnostic. It is built per-arch under -`build/lib//onboard//libsimpler_aicpu_dispatcher.so` as a -sibling of each runtime's host_runtime.so. A single process binding multiple -runtimes can share one dispatcher SO on disk; the host process-level -fingerprint cache deduplicates bootstrap calls by inner-SO Build-ID. - -## Exported entry points - -Three C-style symbols are exposed; `libaicpu_extend_kernels.so::SetTileFwkKernelMap` -dlsym's all three at load time, but only DynInit does real work: - -1. `StaticTileFwkBackendKernelServer` — stub -2. `DynTileFwkBackendKernelServerInit` — bootstrap upload (real work) -3. `DynTileFwkBackendKernelServer` — stub - -See `aicpu_dispatcher.h` for the bootstrap protocol details (extended DeviceArgs -with `inner_so_bin`/`inner_so_len`, FNV-1a content fingerprint). diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp deleted file mode 100644 index 54d5e61ea..000000000 --- a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * AICPU Dispatcher implementation — transient bootstrap-only upload helper. - * - * See aicpu_dispatcher.h for architecture. The dispatcher SO exists only - * to provide a piece of code that runs with sched-thread (HwHiAiUser) - * permissions for one purpose: write the bundled runtime SO bytes to - * the main aicpu_scheduler's preinstall path under a content-fingerprint - * filename. Once Init returns, this SO is no longer referenced — host's - * subsequent Mode B loads target the runtime SO file directly. - */ - -#include "aicpu_dispatcher.h" - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "utils/elf_build_id.h" - -// dlog wrapper so error paths show up in device log without depending on -// our common/unified_log machinery (this SO is loaded standalone by CANN). -extern "C" void DlogRecord(int moduleId, int level, const char *fmt, ...); - -namespace simpler_dispatcher { -constexpr int kDlogModuleCcecpu = 3; -constexpr int kDlogLevelError = 3; - -void DispatcherLog(const char *fmt, ...) { - char buf[1024]; - va_list ap; - va_start(ap, fmt); - vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - // DlogRecord is a non-weak extern: if it failed to resolve, this SO - // would not have dlopen'd in the first place, so an address-vs-nullptr - // guard here is dead code (and is folded to `true` by most compilers). - DlogRecord(kDlogModuleCcecpu, kDlogLevelError, "[simpler-dispatcher] %s", buf); -} -} // namespace simpler_dispatcher - -// Bootstrap-time DeviceArgs view. Layout shared with host's BootstrapDispatcher. -// libaicpu_extend_kernels reads aicpu_so_bin/len/deviceId; we additionally read -// inner_so_bin/len (an extra qword pair past deviceId). -struct KernelArgs { - uint64_t unused[5] = {0}; - void *device_args{nullptr}; - void *runtime_args{nullptr}; - uint64_t regs{0}; -}; -struct DeviceArgs { - uint64_t unused[12] = {0}; - uint64_t aicpu_so_bin{0}; // 96 — dispatcher bytes (libaicpu_extend_kernels) - uint64_t aicpu_so_len{0}; // 104 - uint64_t device_id{0}; // 112 - uint64_t inner_so_bin{0}; // 120 — runtime SO bytes (dispatcher) - uint64_t inner_so_len{0}; // 128 -}; -static_assert(offsetof(KernelArgs, device_args) == 40, "KernelArgs::device_args offset drift"); -static_assert(offsetof(DeviceArgs, aicpu_so_bin) == 96, "DeviceArgs::aicpu_so_bin offset drift"); -static_assert(offsetof(DeviceArgs, aicpu_so_len) == 104, "DeviceArgs::aicpu_so_len offset drift"); -static_assert(offsetof(DeviceArgs, device_id) == 112, "DeviceArgs::device_id offset drift"); -static_assert(offsetof(DeviceArgs, inner_so_bin) == 120, "DeviceArgs::inner_so_bin offset drift"); -static_assert(offsetof(DeviceArgs, inner_so_len) == 128, "DeviceArgs::inner_so_len offset drift"); - -namespace simpler_dispatcher { - -// ELF Build-ID-derived 64-bit fingerprint (linker SHA-1 truncated to 8 -// bytes by `-Wl,--build-id`). Falls back to full-buffer FNV-1a if the SO -// was somehow linked without a build-id note. Host's -// load_aicpu_op.cpp::FingerprintBytes calls the same helper, so both sides -// produce identical fingerprints with no other channel of communication. -// -// The earlier "FNV-1a over the first 64 bytes XOR len" scheme collided in -// practice on same-toolchain runtime SOs whose ELF headers + size matched -// — wrong-code risk on the multi-runtime path. Build-IDs are strong by -// linker contract: identical Build-IDs imply byte-identical SOs. -uint64_t Fingerprint(const char *data, uint64_t len) { - return simpler::common::utils::elf_build_id_64(data, static_cast(len)); -} - -// Preinstall path — HwHiAiUser owns this dir, the sched thread can write here. -// device-side /tmp is mounted read-only / restricted in CANN 9.0. -std::string MakeInnerSoPath(uint64_t fp) { - char buf[256]; - snprintf(buf, sizeof(buf), "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_%016lx.so", fp); - return buf; -} - -// Atomic write: write to a per-process temp path, then rename onto the target. -// Several CI workers may bootstrap on different devices simultaneously and all -// land at the same fingerprinted target path; without atomic rename a reader -// (a sibling aicpu_scheduler's dlopen during its Mode B load) can observe a -// truncated/partially-written file and fail with 507018 or 507046. -// -// Same fingerprint → same content, so whichever rename wins yields identical -// bytes; existing dlopen handles in any aicpu_scheduler stay bound to their -// captured inode and are unaffected by later renames. We don't fast-path on -// the file already existing — a stale corrupt file from a pre-fix run could -// match the fingerprint by chance, and the atomic rename overwrites cheaply. -bool WriteBytes(const std::string &path, const char *data, uint64_t len) { - char tmp_path[320]; - snprintf(tmp_path, sizeof(tmp_path), "%s.tmp.%d", path.c_str(), static_cast(getpid())); - { - std::ofstream f(tmp_path, std::ios::binary | std::ios::trunc); - if (!f.is_open()) { - DispatcherLog("open %s for write failed: %s", tmp_path, strerror(errno)); - return false; - } - f.write(data, static_cast(len)); - bool good = f.good(); - f.close(); - if (!good) { - DispatcherLog("write %s failed", tmp_path); - unlink(tmp_path); - return false; - } - } - (void)chmod(tmp_path, 0755); - if (rename(tmp_path, path.c_str()) != 0) { - DispatcherLog("rename %s -> %s failed: %s", tmp_path, path.c_str(), strerror(errno)); - unlink(tmp_path); - return false; - } - return true; -} - -} // namespace simpler_dispatcher - -// ============================================================================= -// C-style exported entry points dlsym'd by libaicpu_extend_kernels. -// ============================================================================= - -extern "C" { - -// Stubs — libaicpu_extend_kernels::SetTileFwkKernelMap dlsym's all three at -// load time; absence makes the whole SO unmappable. We only reach Init in -// practice, but return 0 (success) here to mirror the happy-path return of -// the old AICPU kernel stubs we replaced. If a future CANN version begins -// invoking Static as a warm-up probe, returning failure would be a silent -// regression versus the prior behavior. -__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args) { - (void)args; - simpler_dispatcher::DispatcherLog("Static: stub (not expected to be called)"); - return 0; -} - -__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args) { - (void)args; - simpler_dispatcher::DispatcherLog("Server: stub (dispatcher is upload-only, not expected to be called)"); - return 0; -} - -// Init: write the bundled runtime SO bytes to a fingerprint-named file under -// the main scheduler's preinstall path, return. Once this returns, host's -// Mode B JSON load can resolve the runtime SO directly — this dispatcher SO -// never gets referenced again. -__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args) { - if (args == nullptr) { - simpler_dispatcher::DispatcherLog("Init: args==nullptr"); - return 1; - } - auto *k = reinterpret_cast(args); - auto *d = reinterpret_cast(k->device_args); - if (d == nullptr) { - simpler_dispatcher::DispatcherLog("Init: device_args==nullptr"); - return 1; - } - if (d->inner_so_bin == 0 || d->inner_so_len == 0) { - simpler_dispatcher::DispatcherLog( - "Init: empty inner SO bundle (bin=%lx len=%lu)", d->inner_so_bin, d->inner_so_len - ); - return 1; - } - const char *inner_bytes = reinterpret_cast(d->inner_so_bin); - uint64_t fp = simpler_dispatcher::Fingerprint(inner_bytes, d->inner_so_len); - std::string path = simpler_dispatcher::MakeInnerSoPath(fp); - if (!simpler_dispatcher::WriteBytes(path, inner_bytes, d->inner_so_len)) { - return 1; - } - simpler_dispatcher::DispatcherLog("Init: wrote %s (%lu bytes)", path.c_str(), d->inner_so_len); - return 0; -} - -} // extern "C" diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h deleted file mode 100644 index 72cd297ab..000000000 --- a/src/common/aicpu_dispatcher/aicpu_dispatcher.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * AICPU Dispatcher — transient bootstrap-only upload helper. - * - * Architecture - * ============ - * - * This dispatcher SO has one job: write the bundled runtime SO bytes to the - * main aicpu_scheduler's preinstall path. It is **never** written to disk - * itself and **never** dispatches at per-task launch time. - * - * Bootstrap flow (host → libaicpu_extend_kernels → dispatcher → preinstall): - * - * 1. host calls `rtAicpuKernelLaunchExWithArgs` (kernel_type = - * `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels with - * DeviceArgs containing: - * - aicpu_so_bin / aicpu_so_len → dispatcher SO bytes (libaicpu_extend_kernels reads) - * - inner_so_bin / inner_so_len → runtime SO bytes (dispatcher reads) - * 2. libaicpu_extend_kernels writes the dispatcher bytes to its own private - * path (some /tmp on device, often unlinked after open), dlopens us, - * dlsym's the three CANN-contract symbols (Static + DynInit + Dyn), - * invokes our `DynTileFwkBackendKernelServerInit`. - * 3. Our Init reads inner_so_bin/inner_so_len from DeviceArgs, fingerprints - * the bytes (FNV-1a over first 64 bytes XOR len), and writes them to - * `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so`. - * The sched thread (HwHiAiUser) owns this dir, so the write succeeds. - * 4. host computes the same fingerprint locally to derive the same - * preinstall filename. - * 5. Per-task launches (Mode B): host calls `rtsBinaryLoadFromFile` to - * JSON-register the preinstall file (cpuKernelMode=0), resolves - * `simpler_aicpu_init` / `simpler_aicpu_exec` via `rtsFuncGetByName`, - * then dispatches each task through `rtsLaunchCpuKernel` on the cached - * `rtFuncHandle`. The main aicpu_scheduler owns the dlopen of the - * preinstall file; this dispatcher SO is no longer in the picture. - * - * Multi-runtime in one host process: each DeviceRunner bootstraps with the - * same dispatcher bytes + its own runtime SO bytes. A process-level - * fingerprint cache in LoadAicpuOp short-circuits repeat invocations for - * the same runtime SO content, so libaicpu_extend_kernels' one-shot - * `firstCreatSo_` latch fires at most once per (process, fingerprint). - */ - -#ifndef COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ -#define COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ - -#include - -// C-style exports required by libaicpu_extend_kernels' SetTileFwkKernelMap -// dlsym contract. Only DynInit does real work; the other two are stubs that -// log + return failure if ever invoked (they shouldn't be — dispatcher is -// upload-only and host's per-task launches target the runtime SO directly). -extern "C" { -__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args); -__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args); -__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args); -} - -#endif // COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp deleted file mode 100644 index 71bcfb922..000000000 --- a/src/common/host/load_aicpu_op.cpp +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * AICPU Operation Loader Implementation - */ - -#include "load_aicpu_op.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "acl/acl.h" -#include "common/unified_log.h" -#include "runtime/rt.h" -#include "utils/elf_build_id.h" - -namespace host { - -namespace { - -std::string MakeInnerSoBasename(uint64_t fp) { - char buf[64]; - snprintf(buf, sizeof(buf), "simpler_inner_%016lx.so", fp); - return buf; -} - -// Per-runtime unique opType — different LoadAicpuOp instances in the same -// process may register the same plain symbol names (simpler_aicpu_init / _exec); -// suffixing with the runtime SO fingerprint keeps CANN's global op registry -// from collapsing distinct registrations. -std::string MakeUniqueOpType(const char *base, uint64_t fp) { - char buf[128]; - snprintf(buf, sizeof(buf), "%s_%016lx", base, fp); - return buf; -} - -// ELF Build-ID-derived 64-bit fingerprint. Dispatcher SO uses the same -// helper on the device side, so both sides agree on the preinstall -// basename without any other channel of communication. See -// src/common/utils/elf_build_id.h for the fallback behavior when the SO -// was linked without a build-id note. -uint64_t FingerprintBytes(const void *data, size_t len) { return simpler::common::utils::elf_build_id_64(data, len); } - -struct DeviceBuf { - void *ptr = nullptr; - ~DeviceBuf() { - if (ptr != nullptr) (void)aclrtFree(ptr); - } - aclError alloc(size_t bytes) { return aclrtMalloc(&ptr, bytes, ACL_MEM_MALLOC_HUGE_FIRST); } -}; - -// Process-level cache of inner-SO fingerprints we've already bootstrapped. -// Multiple DeviceRunner instances in the same process share one entry per -// runtime here; same-content uploads short-circuit. Guarded by a mutex so -// that callers releasing the Python GIL (e.g. nanobind methods marked -// `nb::call_guard`) cannot race on the set's -// internals. The lock is uncontended on the steady-state path and only -// touched at DeviceRunner init time, so the overhead is negligible -// compared to keeping the invariant alive in a comment. -std::unordered_set &BootstrappedFps() { - static std::unordered_set kSet; - return kSet; -} -std::mutex &BootstrappedFpsMutex() { - static std::mutex m; - return m; -} - -} // namespace - -int LoadAicpuOp::BootstrapDispatcher( - const void *dispatcher_so_data, size_t dispatcher_so_len, const void *inner_so_data, size_t inner_so_len, - rtStream_t stream -) { - if (dispatcher_so_data == nullptr || dispatcher_so_len == 0) { - LOG_ERROR("BootstrapDispatcher: empty dispatcher SO bytes"); - return -1; - } - if (inner_so_data == nullptr || inner_so_len == 0) { - LOG_ERROR("BootstrapDispatcher: empty inner SO bytes"); - return -1; - } - inner_fp_ = FingerprintBytes(inner_so_data, inner_so_len); - inner_so_basename_ = MakeInnerSoBasename(inner_fp_); - - { - std::lock_guard lk(BootstrappedFpsMutex()); - if (BootstrappedFps().count(inner_fp_) > 0) { - LOG_INFO_V2("BootstrapDispatcher: inner SO fp=%016lx already bootstrapped, skipping", inner_fp_); - return 0; - } - } - // Note: we deliberately drop the lock for the heavy bootstrap work and - // re-take it for the post-insert below. Two threads racing on the same - // fingerprint will each perform a bootstrap, which is harmless: CANN's - // libaicpu_extend_kernels has a one-shot `firstCreatSo_` latch, and the - // atomic tmp+rename in WriteBytes is idempotent across same-content - // racers. Holding the lock across the upload would serialize all - // multi-runtime ChipWorker init in the process — a real regression. - - size_t dispatcher_len = dispatcher_so_len; - const char *inner_bytes = reinterpret_cast(inner_so_data); - size_t inner_len = inner_so_len; - - DeviceBuf dev_dispatcher; - DeviceBuf dev_inner; - aclError rc = dev_dispatcher.alloc(dispatcher_len); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMalloc(dispatcher) failed: %d", rc); - return rc; - } - rc = aclrtMemcpy(dev_dispatcher.ptr, dispatcher_len, dispatcher_so_data, dispatcher_len, ACL_MEMCPY_HOST_TO_DEVICE); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(dispatcher) failed: %d", rc); - return rc; - } - rc = dev_inner.alloc(inner_len); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMalloc(inner) failed: %d", rc); - return rc; - } - rc = aclrtMemcpy(dev_inner.ptr, inner_len, inner_bytes, inner_len, ACL_MEMCPY_HOST_TO_DEVICE); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(inner) failed: %d", rc); - return rc; - } - - constexpr size_t kDeviceArgsBytes = 160; - char host_dev_args[kDeviceArgsBytes] = {}; - auto write_qword = [&](size_t offset, uint64_t value) { - std::memcpy(host_dev_args + offset, &value, sizeof(value)); - }; - write_qword(96, reinterpret_cast(dev_dispatcher.ptr)); - write_qword(104, static_cast(dispatcher_len)); - write_qword(112, 0); - write_qword(120, reinterpret_cast(dev_inner.ptr)); - write_qword(128, static_cast(inner_len)); - - DeviceBuf dev_args; - rc = dev_args.alloc(kDeviceArgsBytes); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMalloc(device_args) failed: %d", rc); - return rc; - } - rc = aclrtMemcpy(dev_args.ptr, kDeviceArgsBytes, host_dev_args, kDeviceArgsBytes, ACL_MEMCPY_HOST_TO_DEVICE); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(device_args) failed: %d", rc); - return rc; - } - - struct Args { - struct { - uint64_t unused[5] = {0}; - uint64_t device_args_ptr = 0; - uint64_t pad[20] = {0}; - } k_args; - char kernel_name[32]; - char so_name[32]; - char op_name[32]; - } args = {}; - args.k_args.device_args_ptr = reinterpret_cast(dev_args.ptr); - std::strncpy(args.kernel_name, "DynTileFwkKernelServerInit", sizeof(args.kernel_name) - 1); - std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); - args.op_name[0] = '\0'; - - rtAicpuArgsEx_t rt_args = {}; - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(Args, so_name); - - rtError_t rrc = rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", 1, &rt_args, nullptr, stream, 0 - ); - if (rrc != RT_ERROR_NONE) { - LOG_ERROR("BootstrapDispatcher: rtAicpuKernelLaunchExWithArgs failed: %d", rrc); - return rrc; - } - rc = aclrtSynchronizeStream(stream); - if (rc != ACL_SUCCESS) { - LOG_ERROR("BootstrapDispatcher: aclrtSynchronizeStream failed: %d", rc); - return rc; - } - LOG_INFO_V0( - "BootstrapDispatcher: bundled dispatcher (%zu B) + inner SO (%zu B) uploaded; inner SO at %s", dispatcher_len, - inner_len, inner_so_basename_.c_str() - ); - { - std::lock_guard lk(BootstrappedFpsMutex()); - BootstrappedFps().insert(inner_fp_); - } - return 0; -} - -void LoadAicpuOp::Finalize() { - if (binary_handle_ != nullptr) { - rtError_t rc = rtsBinaryUnload(binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_WARN("rtsBinaryUnload failed: %d", rc); - } - binary_handle_ = nullptr; - } - func_handles_.clear(); - inner_fp_ = 0; - inner_so_basename_.clear(); - if (!json_file_path_.empty()) { - std::remove(json_file_path_.c_str()); - json_file_path_.clear(); - } -} - -LoadAicpuOp::~LoadAicpuOp() { Finalize(); } - -bool LoadAicpuOp::GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so) { - // Inputs are a closed set: opType / functionName are KernelNames::* - // constants suffixed with a hex fingerprint, kernelSo is also hex-only, - // and the remaining fields are hard-coded literals. No characters that - // require JSON escaping can appear, so manual string concatenation is - // safe. If you add a field whose value can be user-derived (paths, - // user-supplied identifiers, etc.), switch to a real JSON serializer - // before letting it through. - std::ofstream json_file(json_path); - if (!json_file.is_open()) { - LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str()); - return false; - } - auto make_cfg = [&](const char *symbol_name) { - AicpuOpConfig c; - c.opType = MakeUniqueOpType(symbol_name, inner_fp_); - c.functionName = symbol_name; - c.kernelSo = kernel_so; - c.opKernelLib = "AICPUKernel"; - c.userDefined = "False"; - return c; - }; - std::vector op_configs = { - make_cfg(KernelNames::InitName), - make_cfg(KernelNames::RunName), - }; - json_file << "{\n"; - for (size_t i = 0; i < op_configs.size(); ++i) { - const auto &c = op_configs[i]; - json_file << " \"" << c.opType << "\": {\n"; - json_file << " \"opInfo\": {\n"; - json_file << " \"functionName\": \"" << c.functionName << "\",\n"; - json_file << " \"kernelSo\": \"" << c.kernelSo << "\",\n"; - json_file << " \"opKernelLib\": \"" << c.opKernelLib << "\",\n"; - json_file << " \"computeCost\": \"" << c.computeCost << "\",\n"; - json_file << " \"engine\": \"" << c.engine << "\",\n"; - json_file << " \"flagAsync\": \"" << c.flagAsync << "\",\n"; - json_file << " \"flagPartial\": \"" << c.flagPartial << "\",\n"; - json_file << " \"userDefined\": \"" << c.userDefined << "\"\n"; - json_file << " }\n"; - json_file << " }" << (i < op_configs.size() - 1 ? "," : "") << "\n"; - } - json_file << "}\n"; - return true; -} - -int LoadAicpuOp::Init() { - if (inner_fp_ == 0) { - LOG_ERROR("LoadAicpuOp::Init: BootstrapDispatcher must be called first"); - return -1; - } - - // Per-process JSON path. /tmp is always writable. - char json_name_buf[128]; - snprintf( - json_name_buf, sizeof(json_name_buf), "/tmp/simpler_inner_%016lx_%d.json", inner_fp_, static_cast(getpid()) - ); - json_file_path_ = json_name_buf; - - if (!GenerateAicpuOpJson(json_file_path_, inner_so_basename_)) { - json_file_path_.clear(); - return -1; - } - - // RAII cleanups: any non-zero return path below unwinds via these guards. - // .release() flips them off once the corresponding state becomes part of - // the LoadAicpuOp's steady-state ownership. - struct JsonGuard { - std::string &path; - bool active = true; - ~JsonGuard() { - if (active && !path.empty()) { - std::remove(path.c_str()); - path.clear(); - } - } - void release() { active = false; } - } json_guard{json_file_path_}; - - struct BinaryGuard { - void *&handle; - bool active = true; - ~BinaryGuard() { - if (active && handle != nullptr) { - (void)rtsBinaryUnload(handle); - handle = nullptr; - } - } - void release() { active = false; } - } binary_guard{binary_handle_}; - - rtLoadBinaryOption_t option = {}; - option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE; - option.value.cpuKernelMode = 0; - - rtLoadBinaryConfig_t load_config = {}; - load_config.options = &option; - load_config.numOpt = 1; - - LOG_INFO_V2("LoadAicpuOp::Init: JSON=%s inner_basename=%s", json_file_path_.c_str(), inner_so_basename_.c_str()); - - rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc); - // binary_handle_ stays null; json_guard removes the JSON file. - return rc; - } - LOG_INFO_V2("LoadAicpuOp: Loaded inner SO via JSON, handle=%p", binary_handle_); - - const char *symbol_names[] = {KernelNames::InitName, KernelNames::RunName}; - for (const char *name : symbol_names) { - std::string lookup_name = MakeUniqueOpType(name, inner_fp_); - rtFuncHandle func_handle = nullptr; - rc = rtsFuncGetByName(binary_handle_, lookup_name.c_str(), &func_handle); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsFuncGetByName failed for %s: %d", lookup_name.c_str(), rc); - // binary_guard unloads the partially-registered binary, json_guard - // removes the JSON file. Symmetric with the rtsBinaryLoadFromFile - // failure branch above. - return rc; - } - func_handles_[name] = func_handle; - LOG_INFO_V2("LoadAicpuOp: resolved handle for %s (opType=%s): %p", name, lookup_name.c_str(), func_handle); - } - - binary_guard.release(); - json_guard.release(); - return 0; -} - -int LoadAicpuOp::AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num) { - rtCpuKernelArgs_t cpu_args = {}; - cpu_args.baseArgs.args = k_args; - cpu_args.baseArgs.argsSize = sizeof(KernelArgs); - - rtKernelLaunchCfg_t kernelLaunchCfg = {nullptr, 0U}; - auto launchKernelAttr = std::make_unique(); - kernelLaunchCfg.attrs = launchKernelAttr.get(); - - rtError_t rc = - rtsLaunchCpuKernel(func_handle, static_cast(aicpu_num), stream, &kernelLaunchCfg, &cpu_args); - if (rc != RT_ERROR_NONE) { - LOG_ERROR("rtsLaunchCpuKernel failed: %d", rc); - return rc; - } - return 0; -} - -int LoadAicpuOp::LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name) { - auto it = func_handles_.find(func_name); - if (it == func_handles_.end()) { - LOG_ERROR("Function not found: %s", func_name.c_str()); - return -1; - } - return AicpuKernelLaunch(it->second, stream, k_args, aicpu_num); -} - -} // namespace host diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h deleted file mode 100644 index dd4c94fa5..000000000 --- a/src/common/host/load_aicpu_op.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * @file load_aicpu_op.h - * @brief Host-side AICPU operation loader. - * - * Three-phase architecture: - * - * 1. BootstrapDispatcher (per-DeviceRunner, idempotent across instances in - * the same process via a content-fingerprint cache): bundles dispatcher - * SO bytes + runtime SO bytes into a single Mode A KFC launch - * (`rtAicpuKernelLaunchExWithArgs`, kernel_type = - * `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels. Our - * dispatcher then writes the runtime SO to - * `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so` - * using sched-thread (HwHiAiUser) write permission. The dispatcher SO - * itself is never persisted to disk. - * - * 2. Init (per-DeviceRunner): JSON-registers the runtime SO via - * `rtsBinaryLoadFromFile` (cpuKernelMode=0, kernelSo points at the - * preinstall basename), then resolves `simpler_aicpu_init` and - * `simpler_aicpu_exec` to `rtFuncHandle`s via `rtsFuncGetByName`. JSON - * is per-process (`/tmp/simpler_inner__.json`) so concurrent - * multi-chip / multi-worker tests don't race on a shared file. - * - * 3. LaunchBuiltInOp (per-task): `rtsLaunchCpuKernel` on the cached - * `rtFuncHandle`. No per-launch string marshalling, no global op - * registry lookups. - * - * See common/aicpu_dispatcher/aicpu_dispatcher.h for the bootstrap protocol - * details (extended DeviceArgs with inner_so_bin/inner_so_len, - * fingerprint-named preinstall files). - */ - -#ifndef COMMON_HOST_LOAD_AICPU_OP_H_ -#define COMMON_HOST_LOAD_AICPU_OP_H_ - -#include -#include -#include - -#include "common/kernel_args.h" -#include "runtime/runtime/rts/rts_kernel.h" -#include "runtime/rt.h" - -namespace host { - -/** - * @brief AICPU operation configuration for JSON descriptor generation. - */ -struct AicpuOpConfig { - std::string functionName; - std::string kernelSo; - std::string opKernelLib; - std::string computeCost = "100"; - std::string engine = "DNN_VM_AICPU"; - std::string flagAsync = "False"; - std::string flagPartial = "False"; - std::string userDefined = "False"; - std::string opType; -}; - -/** - * @brief Host-side AICPU operation loader. - * - * One instance per DeviceRunner; manages bootstrap (dispatcher upload) + - * JSON registration of the runtime SO + per-task launches via the runtime - * SO's direct rtFuncHandles. - */ -class LoadAicpuOp { -public: - LoadAicpuOp() = default; - ~LoadAicpuOp(); - - LoadAicpuOp(const LoadAicpuOp &) = delete; - LoadAicpuOp &operator=(const LoadAicpuOp &) = delete; - LoadAicpuOp(LoadAicpuOp &&) = delete; - LoadAicpuOp &operator=(LoadAicpuOp &&) = delete; - - /** - * @brief One-shot bootstrap: upload runtime SO to preinstall via dispatcher. - * - * @param dispatcher_so_data Dispatcher SO bytes (caller-owned, must outlive call) - * @param dispatcher_so_len Dispatcher SO size - * @param inner_so_data Runtime SO bytes (caller-owned, must outlive call) - * @param inner_so_len Runtime SO size - * @param stream Stream on which to enqueue the bootstrap - * @return 0 on success, error code on failure - */ - int BootstrapDispatcher( - const void *dispatcher_so_data, size_t dispatcher_so_len, const void *inner_so_data, size_t inner_so_len, - rtStream_t stream - ); - - /** - * @brief JSON-register the runtime SO and resolve its Init/Exec handles. - */ - int Init(); - - /** @brief Release binary handle + function handles + temporary JSON. */ - void Finalize(); - - /** - * @brief Launch a runtime SO entry point via rtsLaunchCpuKernel. - * - * @param stream RTS stream - * @param k_args Kernel arguments - * @param aicpu_num Number of AICPU threads (1 for Init, N for Exec) - * @param func_name Lookup key in func_handles_ (KernelNames::InitName/RunName) - * @return 0 on success, error code on failure - */ - int LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name); - -private: - void *binary_handle_ = nullptr; - std::unordered_map func_handles_; - std::string json_file_path_; - uint64_t inner_fp_ = 0; - std::string inner_so_basename_; - - bool GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so); - int AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num); -}; - -// Runtime SO's actual exported symbol names. Both are looked up via the -// runtime SO's own JSON registration (no dispatcher hop at runtime). -namespace KernelNames { -constexpr const char *InitName = "simpler_aicpu_init"; // single-threaded init -constexpr const char *RunName = "simpler_aicpu_exec"; // multi-threaded exec -} // namespace KernelNames - -} // namespace host - -#endif // COMMON_HOST_LOAD_AICPU_OP_H_ diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 2ee392ab1..7fab4c295 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -58,8 +58,7 @@ std::vector read_binary_file(const std::string &path) { ChipWorker::~ChipWorker() { finalize(); } void ChipWorker::init( - const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, - const std::string &dispatcher_path, int device_id + const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, int device_id ) { if (finalized_) { throw std::runtime_error("ChipWorker already finalized; cannot reinitialize"); @@ -156,19 +155,8 @@ void ChipWorker::init( try { std::vector aicpu_bytes = read_binary_file(aicpu_path); std::vector aicore_bytes = read_binary_file(aicore_path); - // dispatcher_path is empty on sim (no dispatcher) and on tests that - // exercise _ChipWorker.init directly without a RuntimeBinaries. - // simpler_init treats a null/empty buffer as "no dispatcher" — onboard - // ensure_binaries_loaded raises with a clear message if the bootstrap - // is actually attempted, sim ignores it entirely. - std::vector dispatcher_bytes; - if (!dispatcher_path.empty()) { - dispatcher_bytes = read_binary_file(dispatcher_path); - } - const uint8_t *dispatcher_ptr = dispatcher_bytes.empty() ? nullptr : dispatcher_bytes.data(); init_rc = simpler_init_fn_( - device_ctx_, device_id, aicpu_bytes.data(), aicpu_bytes.size(), aicore_bytes.data(), aicore_bytes.size(), - dispatcher_ptr, dispatcher_bytes.size() + device_ctx_, device_id, aicpu_bytes.data(), aicpu_bytes.size(), aicore_bytes.data(), aicore_bytes.size() ); } catch (...) { destroy_device_context_fn_(device_ctx_); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index e1632eb2a..2227245f1 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -42,8 +42,7 @@ class ChipWorker { /// globals. The Python `ChipWorker` wrapper does this with `ctypes.CDLL(..., /// mode=RTLD_GLOBAL)`. void init( - const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, - const std::string &dispatcher_path, int device_id + const std::string &host_lib_path, const std::string &aicpu_path, const std::string &aicore_path, int device_id ); /// Tear down everything: device resources and runtime library. @@ -139,8 +138,7 @@ class ChipWorker { // From host_runtime.so. Single platform-side init that does (a) thread // attach + device-id record, (b) executor binary takeover, (c) onboard // CANN dlog sync. Reads the current log level off HostLogger itself. - using SimplerInitFn = - int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t); + using SimplerInitFn = int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t); using PrepareCallableFn = int (*)(void *, int32_t, const void *); using RunPreparedFn = int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *); diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index c4f6b7adf..00debb446 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -132,7 +132,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de */ int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, - const uint8_t *aicore_binary, size_t aicore_size, const uint8_t *dispatcher_binary, size_t dispatcher_size + const uint8_t *aicore_binary, size_t aicore_size ); /** diff --git a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py index 0201d490f..5f5fd1002 100644 --- a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py +++ b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py @@ -75,22 +75,11 @@ def test_aicore_op_timeout_surfaces_as_runtime_error(st_platform, st_device_ids) config.aicpu_thread_num = 2 t0 = time.monotonic() - # Acceptable error codes for the STARS-killed AICore op. Which one - # surfaces is timing-dependent — it's whichever stream sync sees the - # AIC failure first: - # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — AICore stream's 2 s - # sync budget fires before AICPU sync notices. - # 507018 = ACL_ERROR_RT_AICPU_EXCEPTION — AICPU stream sync surfaces - # the AICore failure as an AICPU exception when the - # orchestration kernel detects the dead AIC task first. - # 507000 = ACL_ERROR_RT_INTERNAL_ERROR — same detection on a5, - # mapped through a different code path. - # All three are valid on both a2a3 and a5: the timing race is between - # AICPU and AICore stream sync on host, not arch-specific. The - # regression we care about is that the timeout chain reaps the hang - # in single-digit seconds and surfaces *some* 507xxx code rather than - # deadlocking. - with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507(046|018|000)"): + # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — what + # aclrtSynchronizeStreamWithTimeout returns when the AICore stream + # (carrying the STARS-killed op) doesn't drain within the host's 2 s + # budget. Observed elapsed on Ascend910 / a2a3 onboard: ~6.3 s. + with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507046"): worker.run(cid, ChipStorageTaskArgs(), config) elapsed = time.monotonic() - t0 diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 5057b50ac..0d6762c14 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -90,17 +90,17 @@ def test_init_after_finalize_raises(self): worker = _ChipWorker() worker.finalize() with pytest.raises(RuntimeError, match="finalized"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", device_id=0) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", device_id=0) def test_init_with_nonexistent_lib_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="dlopen"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", device_id=0) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", device_id=0) def test_init_with_negative_device_id_raises(self): worker = _ChipWorker() with pytest.raises(RuntimeError, match="device_id"): - worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", "", -1) + worker.init("/nonexistent/libfoo.so", "/dev/null", "/dev/null", -1) def test_prepare_callable_before_init_raises(self): from _task_interface import ChipCallable # noqa: PLC0415