Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,3 @@ compile_commands.json
python/_task_interface*.so
python/_task_interface*.dylib
.claude/scheduled_tasks.lock

# Log files
*.log
profiling_logs_*/
2 changes: 1 addition & 1 deletion python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ NB_MODULE(_task_interface, m) {
.def(nb::init<>())
.def(
"init", &ChipWorker::init, nb::arg("host_lib_path"), nb::arg("aicpu_path"), nb::arg("aicore_path"),
nb::arg("dispatcher_path"), nb::arg("device_id")
nb::arg("device_id")
)
.def("finalize", &ChipWorker::finalize)
.def(
Expand Down
9 changes: 1 addition & 8 deletions python/simpler/task_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,9 +319,7 @@ def init(self, device_id, bins, log_level=None, log_info_v=None):
device_id: NPU device ID to attach the calling thread to.
bins: A `simpler_setup.runtime_builder.RuntimeBinaries` (or any
object exposing host_path / aicpu_path / aicore_path /
simpler_log_path / sim_context_path / dispatcher_path).
``dispatcher_path`` is required for onboard platforms and
ignored on sim (set to None).
simpler_log_path / sim_context_path).
log_level: Severity floor (0=DEBUG..4=NUL). Defaults to a snapshot
of the simpler logger via `_log.get_current_config()`.
log_info_v: INFO verbosity threshold (0..9). Same default.
Expand Down Expand Up @@ -356,15 +354,10 @@ def init(self, device_id, bins, log_level=None, log_info_v=None):
_preload_global(str(bins.sim_context_path))

# 3. host_runtime.so is dlopen'd RTLD_LOCAL inside _impl.init.
# dispatcher_path is passed as an empty string on sim (where bins
# has dispatcher_path=None); the onboard simpler_init reads it
# via LoadAicpuOp::BootstrapDispatcher, sim ignores it.
dispatcher_path = getattr(bins, "dispatcher_path", None)
self._impl.init(
str(bins.host_path),
str(bins.aicpu_path),
str(bins.aicore_path),
"" if dispatcher_path is None else str(dispatcher_path),
int(device_id),
)

Expand Down
8 changes: 1 addition & 7 deletions simpler_setup/build_runtimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def build_all(
raise

for platform in platforms:
arch, _ = parse_platform(platform)
arch, variant = parse_platform(platform)
runtimes = discover_runtimes(arch)

if not runtimes:
Expand All @@ -152,12 +152,6 @@ def build_all(
logger.error(f" Failed to build {platform}/{runtime_name}: {e}")
raise

# No device-side deployment step here. The dispatcher SO is uploaded
# into the main aicpu_scheduler at runtime, on the first
# DeviceRunner::ensure_binaries_loaded call, via
# LoadAicpuOp::BootstrapDispatcher (see src/common/host/load_aicpu_op.cpp
# and src/common/aicpu_dispatcher/aicpu_dispatcher.h for architecture).


def main():
parser = argparse.ArgumentParser(description="Pre-build runtime binaries for available platforms")
Expand Down
49 changes: 1 addition & 48 deletions simpler_setup/runtime_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,13 @@ def _invalidate_cache_if_stale(target_cache_dir: Path, current_commit: str) -> N

@dataclass
class RuntimeBinaries:
"""Paths to the compiled runtime binaries.

``dispatcher_path`` points at ``libsimpler_aicpu_dispatcher.so`` and is
required for onboard platforms (host bootstrap reads its bytes and ships
them to the device alongside the inner SO). Sim platforms have no
dispatcher; the field is ``None`` there. ``_lookup_binaries`` resolves
and validates the path against the build output directory.
"""
"""Paths to the compiled runtime binaries."""

host_path: Path
aicpu_path: Path
aicore_path: Path
simpler_log_path: Path
sim_context_path: Optional[Path] = None
dispatcher_path: Optional[Path] = None


class RuntimeBuilder:
Expand Down Expand Up @@ -194,24 +186,12 @@ def _lookup_binaries(self, name: str, output_dir: Path) -> RuntimeBinaries:
"Run 'pip install .' or pass --build to compile it."
)

# Resolve and validate libsimpler_aicpu_dispatcher.so for onboard
# platforms. runtime_compiler stages one copy per arch into
# <LIB_DIR>/<arch>/dispatcher/ (shared across all runtimes); sim
# platforms have no dispatcher.
dispatcher_path = self._resolve_dispatcher_path()
if dispatcher_path is not None and not dispatcher_path.is_file():
raise FileNotFoundError(
f"Pre-built libsimpler_aicpu_dispatcher.so not found at {dispatcher_path}.\n"
"Run 'pip install .' or pass --build to compile it."
)

return RuntimeBinaries(
host_path=paths["host"],
aicpu_path=paths["aicpu"],
aicore_path=paths["aicore"],
simpler_log_path=simpler_log_path,
sim_context_path=sim_context_path,
dispatcher_path=dispatcher_path,
)

def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries:
Expand All @@ -236,11 +216,6 @@ def get_binaries(self, name: str, build: bool = False) -> RuntimeBinaries:

arch, variant = self._arch, self._variant
output_dir = self._LIB_DIR / arch / variant / name
# Per-arch shared destination for libsimpler_aicpu_dispatcher.so. The
# dispatcher has no runtime-specific code, so all runtimes on a given
# arch reuse the same SO instead of carrying a copy each (~50 KB × N).
# None on sim — sim variants have no dispatcher.
dispatcher_staging_dir = self._LIB_DIR / arch / "dispatcher" if variant != "sim" else None

if not build:
return self._lookup_binaries(name, output_dir)
Expand Down Expand Up @@ -272,7 +247,6 @@ def _compile_target(target: str) -> Path:
source_dirs,
build_dir=str(cache_dir),
output_dir=output_dir,
dispatcher_dest=dispatcher_staging_dir if target == "aicpu" else None,
)

logger.info("Compiling AICore, AICPU, Host in parallel...")
Expand All @@ -294,35 +268,14 @@ def _compile_target(target: str) -> Path:

self._place_compile_commands(name)
logger.info("Build complete!")
# runtime_compiler stages libsimpler_aicpu_dispatcher.so into the
# per-arch shared directory when target=='aicpu'. Surface it through
# RuntimeBinaries so ChipWorker.init can pass the path to
# LoadAicpuOp::BootstrapDispatcher.
dispatcher_path = self._resolve_dispatcher_path()
if dispatcher_path is not None and not dispatcher_path.is_file():
dispatcher_path = None
return RuntimeBinaries(
host_path=host_path,
aicpu_path=aicpu_path,
aicore_path=aicore_path,
simpler_log_path=simpler_log_path,
sim_context_path=sim_context_path,
dispatcher_path=dispatcher_path,
)

def _resolve_dispatcher_path(self) -> Optional[Path]:
"""Return path to libsimpler_aicpu_dispatcher.so for onboard variants.

Returns ``None`` for sim variants (no dispatcher needed: sim's AICPU
runs in-process). For onboard, runtime_compiler stages one shared
copy per arch under ``build/lib/<arch>/dispatcher/`` (the dispatcher
has no runtime-specific code, so all onboard runtimes on a given
arch use the same SO). Validated separately by ``_lookup_binaries``.
"""
if self._variant == "sim":
return None
return self._LIB_DIR / self._arch / "dispatcher" / "libsimpler_aicpu_dispatcher.so"

def _resolve_sim_context_path(self) -> Optional[Path]:
"""Return path to libcpu_sim_context.so for sim platforms, None for onboard.

Expand Down
22 changes: 0 additions & 22 deletions simpler_setup/runtime_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ def compile(
source_dirs: list[str],
build_dir: Optional[str] = None,
output_dir: Optional[Union[str, Path]] = None,
dispatcher_dest: Optional[Union[str, Path]] = None,
) -> Union[bytes, Path]:
"""
Compile binary for the specified target platform.
Expand All @@ -213,12 +212,6 @@ def compile(
build_dir: The directory path for compiling. When None, use a temporal path.
output_dir: Directory to copy the final binary into. When set, returns Path.
When None, returns bytes (backward-compatible).
dispatcher_dest: Directory to stage libsimpler_aicpu_dispatcher.so into.
Only consumed when target_platform == 'aicpu' (the aicpu
CMakeLists builds the dispatcher target as a side product).
When None, the dispatcher SO is not exported. Used by
runtime_builder to share one dispatcher SO across all
runtimes for a given arch.

Returns:
If output_dir is set: Path to the compiled binary in output_dir.
Expand Down Expand Up @@ -251,21 +244,6 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]:
platform=platform,
build_dir=actual_build_dir,
)
# Stage the AICPU dispatcher SO into the per-arch shared directory
# provided by runtime_builder. The dispatcher has no runtime-specific
# code (same source under any RUNTIME_NAME), so one copy per arch
# serves every runtime variant — the path is later surfaced through
# RuntimeBinaries.dispatcher_path. Only fires when the aicpu cmake
# build actually produced the dispatcher SO as a side product.
if target_platform == "aicpu" and dispatcher_dest is not None:
dispatcher_name = "libsimpler_aicpu_dispatcher.so"
dispatcher_so = Path(actual_build_dir) / dispatcher_name
if dispatcher_so.is_file():
dest_dir = Path(dispatcher_dest)
dest_dir.mkdir(parents=True, exist_ok=True)
dest_dispatcher = dest_dir / dispatcher_name
shutil.copy2(dispatcher_so, dest_dispatcher)
subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True)
if output_dir is not None:
od = Path(output_dir)
od.mkdir(parents=True, exist_ok=True)
Expand Down
51 changes: 0 additions & 51 deletions src/a2a3/platform/onboard/aicpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher")
if(DEFINED CUSTOM_INCLUDE_DIRS)
foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS})
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}")
Expand Down Expand Up @@ -85,53 +84,3 @@ target_link_directories(aicpu_kernel

# Output name
set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel)

# Build dispatcher SO — bootstrap-only upload helper. The dispatcher has NO
# runtime-specific code; libaicpu_extend_kernels loads it once via
# rtAicpuKernelLaunchExWithArgs(KERNEL_TYPE_AICPU_KFC), invokes
# DynTileFwkBackendKernelServerInit, which writes the bundled inner SO bytes
# (passed via the extended DeviceArgs at offsets 120/128) to
# /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fp>.so under
# a content-fingerprint basename. After bootstrap the host registers the
# preinstall file via Mode B (rtsBinaryLoadFromFile + rtsFuncGetByName) and
# launches per-task through rtsLaunchCpuKernel; the dispatcher SO itself is
# never referenced again.
#
# Output name is fixed ("simpler_aicpu_dispatcher"). See
# src/common/aicpu_dispatcher/{aicpu_dispatcher.h,README.md} for the
# extended DeviceArgs layout and the FNV-1a/Build-ID fingerprint protocol.
set(AICPU_DISPATCHER_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp"
)
add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES})

target_compile_options(aicpu_dispatcher
PRIVATE
-Wall
-Wextra
-rdynamic
-O3
-fPIC
-g
$<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17>
)

target_include_directories(aicpu_dispatcher
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CUSTOM_INCLUDE_DIRS}
# src/common is needed so `#include "utils/elf_build_id.h"` resolves;
# host_runtime.so already has this on its include path (see host
# CMakeLists), and the dispatcher uses the same header to fingerprint
# the inner SO bytes by their ELF Build-ID rather than a 64-byte FNV
# over the (mostly-shared) ELF header.
${CMAKE_CURRENT_SOURCE_DIR}/../../../../common
${ASCEND_HOME_PATH}/include
)

target_link_libraries(aicpu_dispatcher PRIVATE dl)

set_target_properties(aicpu_dispatcher PROPERTIES
LINK_FLAGS "-Wl,--build-id"
OUTPUT_NAME "simpler_aicpu_dispatcher"
)
46 changes: 27 additions & 19 deletions src/a2a3/platform/onboard/aicpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#include "runtime.h"

// Run-wall capture: g_device_start_cycle is set once in
// simpler_aicpu_init (single-threaded launch); each thread
// of the multi-threaded simpler_aicpu_exec writes the converted
// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
// last-writer-wins is fine for wall measurement (concurrent exiting threads'
// `my_end` values differ by µs, the final overwrite is within benchmark
Expand All @@ -35,20 +35,27 @@ static uint64_t g_device_start_cycle = 0;
// Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
extern "C" int aicpu_execute(Runtime *arg);

extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) {
if (arg == nullptr) {
LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
return -1;
}

return 0;
}

/**
* AICPU kernel initialization entry point.
* AICPU kernel initialization entry point
*
* Called once per run by the main aicpu_scheduler. Host registers this SO
* via `rtsBinaryLoadFromFile` (Mode B JSON load, cpuKernelMode=0) and
* resolves this symbol via `rtsFuncGetByName`; the per-task launch goes
* through `rtsLaunchCpuKernel` on the cached `rtFuncHandle`. The bootstrap
* dispatcher only writes this SO to the preinstall path — it does not
* dlsym these symbols itself.
* This function is called once during kernel initialization by the CANN
* runtime. It initializes logging and validates kernel arguments.
*
* Note: Function name is hardcoded in libaicpu_extend_kernels.so
*
* @param arg Pointer to KernelArgs structure
* @return 0 on success, -1 on error
*/
extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) {
extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) {
init_log_switch();
if (arg == nullptr) {
LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
Expand All @@ -60,7 +67,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a

// Init is launched single-threaded (block_dim=1), so the race-free spot
// to capture run start and reset the wall accumulator. Subsequent
// simpler_aicpu_exec threads stamp end on their way out, via
// DynTileFwkBackendKernelServer threads stamp end on their way out, via
// the device-resident 8-byte buffer addressed by device_wall_data_base.
g_device_start_cycle = get_sys_cnt_aicpu();
if (k_args->device_wall_data_base != 0) {
Expand All @@ -72,16 +79,17 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *a
}

/**
* AICPU kernel main execution entry point.
* AICPU kernel main execution entry point
*
* This is the main entry point for the AICPU runtime executor kernel.
* It extracts the Runtime from KernelArgs and delegates to AicpuExecute.
*
* Called per-thread by the main aicpu_scheduler via the cached
* `rtFuncHandle` resolved during host-side Mode B init (see
* `simpler_aicpu_init` docstring for the load path).
* Note: Function name is hardcoded in libaicpu_extend_kernels.so
*
* @param arg Pointer to KernelArgs structure containing runtime_args
* @return 0 on success, non-zero on error
*/
extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) {
extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) {
if (arg == nullptr) {
LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
return -1;
Expand Down Expand Up @@ -120,13 +128,13 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
return 0;
}

LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime");
LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime");
int rc = aicpu_execute(runtime);
if (rc != 0) {
LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc);
LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc);
return rc;
}
LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully");
LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");

// Stamp end into the device_wall buffer (addressed via
// device_wall_data_base). Last-writer-wins across threads — wall
Expand Down
Loading
Loading