diff --git a/docs/L3-L2-host-device-communication.md b/docs/L3-L2-host-device-communication.md new file mode 100644 index 000000000..05dfaea51 --- /dev/null +++ b/docs/L3-L2-host-device-communication.md @@ -0,0 +1,628 @@ +# L3/L2 Host-Device Communication + +This page explains Simpler's L3/L2 host-device communication primitive: +`HostDeviceMappedRegion`. + +It is written for people learning the Simpler library. It covers ownership, +lifetime, the public ABI, Python usage, datacopy, signal semantics, and how +mapped regions relate to the normal `TaskArgs` tensor path. It intentionally +does not document mailbox payload layouts, backend allocation internals, or +test implementation details. + +For the surrounding runtime model, see +[hierarchical_level_runtime.md](hierarchical_level_runtime.md) and +[task-flow.md](task-flow.md). + +## Why It Exists + +Simpler's normal task path is tensor-oriented: + +1. User code builds `TaskArgs`. +2. The runtime stages tensor inputs to device memory. +3. A chip task runs. +4. Output tensors are copied back or validated through the task runtime. + +That path is the right default for ordinary kernel inputs and outputs. It is +less suitable when host code and device code need to coordinate across several +steps, reuse one device-visible buffer, or exchange small phase signals without +turning every phase into a new task payload. + +`HostDeviceMappedRegion` fills that gap. A mapped region gives host code: + +- a reusable data area that is visible to device code, +- one or more signal slots for simple phase handshakes, +- explicit host-to-region and region-to-host byte copies, and +- explicit notify/wait operations. + +The primitive is intentionally small. It does not define a queue, channel, +message format, tensor descriptor, or scheduler dependency policy. Higher-level +protocols can build those rules on top of the data area and signal slots. 
+ +## Runtime Ownership + +The mapped region is owned by the process that owns the chip-side +`DeviceContext`. + +At L2, a `Worker(level=2)` owns one `ChipWorker` and talks to one chip +directly: + +```text +Python Worker(level=2) + | + +-- ChipWorker + | + +-- L2 chip runtime and kernels +``` + +At L3, a `Worker(level=3)` owns one chip child process per device id. The L3 +parent exposes the same mapped-region methods, and `worker_id` selects which +chip child owns the region: + +```text +Python Worker(level=3) + | + +-- chip child 0 -> ChipWorker -> L2 chip 0 + +-- chip child 1 -> ChipWorker -> L2 chip 1 + +-- sub workers +``` + +In L3 process mode, the chip child process owns `ChipWorker`, `DeviceRunner`, +the loaded host runtime, and the mapped-region registry. The L3 parent owns +only a Python `MappedRegion` wrapper and reaches the child through the +existing parent-child control path. + +The NPU does not own the allocation lifetime. Device code participates by +reading and writing the device-visible addresses returned by +`mapped_region_info()`. + +Pointer semantics follow the ownership rule: + +- `device_data_ptr` and `device_signal_ptr` are public device-visible + addresses. They can be passed to kernels through `TaskArgs` scalars or + tensor metadata. +- `host_data_ptr` and `host_signal_ptr` are not public Python dereferenceable + addresses. The Python API reports them as `0`. +- Host code accesses the region through datacopy methods, not by writing a + mapped host pointer directly. + +The L3 mailbox is only a host-side proxy transport. It is not the CPU-NPU +mapped-region primitive itself. + +## Lifetime And Handle Ownership + +Each opened region is registered in the owning `DeviceContext`. All `info`, +datacopy, notify, wait, and close operations validate that the handle belongs +to that context. + +A handle from another context, a stale handle, a closed handle, or a +double-close is invalid. 
In Python, `MappedRegion` also records the owning +`worker_id`; passing a different `worker_id` to a later operation is rejected +before the request is sent to the chip child. + +`close_mapped_region()` releases the owner-side host mapping and device +allocation. It is not a device synchronization operation. The caller must +ensure no in-flight kernel, AICPU code, or other device participant still uses +`device_data_ptr` or `device_signal_ptr` before closing the region. + +Usually that means waiting for task completion or for the protocol's +completion signal before close: + +```text +host writes input +host notify input_ready +device reads input and writes output +device publish output_ready +host wait output_ready +host reads output +host close mapped region +``` + +`Worker.close()`, `finalize_device()`, and `destroy_device_context()` clean up +remaining mapped regions as a resource fallback. That cleanup does not make it +safe to close while device code is still accessing the region. + +## Public ABI + +The runtime C ABI defines an opaque handle plus config and info structures in +`src/common/worker/pto_runtime_c_api.h`. Most library users do not call this +ABI directly, but it is the stable boundary that `ChipWorker` resolves from a +runtime shared object. + +```cpp +typedef void *HostDeviceMappedRegionHandle; + +typedef struct HostDeviceMappedRegionConfig { + uint64_t data_bytes; + uint32_t signal_count; + uint32_t flags; +} HostDeviceMappedRegionConfig; + +typedef struct HostDeviceMappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint32_t reserved0; + uint64_t total_bytes; + uint32_t flags; + uint32_t reserved1; +} HostDeviceMappedRegionInfo; +``` + +`flags` is reserved and must be `0`. 
+ +The C ABI entry points are: + +```cpp +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + const HostDeviceMappedRegionConfig *cfg, + HostDeviceMappedRegionHandle *out_region +); + +int close_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region +); + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + const void *src, + size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + void *dst, + size_t nbytes +); + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t value +); + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t target, + uint32_t timeout_us +); +``` + +The ABI uses negative errno-style return codes: + +- `0`: success. +- `-EINVAL`: invalid context, handle, config, range, signal id, value, or + pointer. +- `-EAGAIN` / `-EWOULDBLOCK`: non-blocking wait miss or bounded wait timeout. +- `-ENOMEM`: allocation or wrapper construction failure. +- `-EIO`: backend mapping, datacopy, or signal failure. +- `-ENOTSUP`: unsupported platform or unsupported backend feature. + +Python maps invalid user input to `ValueError`, wait miss or timeout to +`TimeoutError`, and backend or unsupported-platform failures to `RuntimeError`. 
+ +## Python API + +The user-facing API is exposed through `Worker`: + +```python +region = worker.open_mapped_region( + data_bytes, + signal_count=2, + flags=0, + worker_id=0, +) + +info = worker.mapped_region_info(region) + +worker.mapped_region_datacopy_h2region(region, offset, payload) +payload = worker.mapped_region_datacopy_region2h(region, offset, nbytes) + +worker.mapped_region_notify(region, signal_id, value) +worker.mapped_region_wait(region, signal_id, target, timeout_us) + +worker.close_mapped_region(region) +``` + +Direct L2 calls execute in the owner process. L3 calls route to the selected +chip child while preserving the same public method names. + +`MappedRegion` is a lightweight Python wrapper, not a pointer. It records: + +- the opaque runtime handle, +- the owning `worker_id`, +- the requested `data_bytes`, +- the requested `signal_count`, +- the reserved `flags` value, and +- whether the region is closed. + +Follow-up operations default to `region.worker_id`. Passing a different +`worker_id` is a user error. Operations on a closed region are also user +errors. + +`mapped_region_info()` returns a `MappedRegionInfo` object with: + +- `device_data_ptr`: device-visible base address of the data area, +- `device_signal_ptr`: device-visible base address of the signal slots, +- `data_bytes`: usable data bytes, +- `signal_count`: number of signal slots, +- `total_bytes`: backend allocation size, and +- `flags`: currently `0`. + +The host pointer fields are always reported as `0` in the public Python API. + +`mapped_region_datacopy_h2region()` accepts bytes-like contiguous buffers. +`str` is rejected; encode text explicitly before passing it. Non-contiguous +buffers are invalid. + +`mapped_region_datacopy_region2h()` returns a new `bytes` object. 
+ +## Datacopy Semantics + +The datacopy APIs move raw bytes between a caller-provided host buffer and the +mapped region data area: + +```text +datacopy_h2region: + host buffer -> region data[offset:offset+nbytes] + +datacopy_region2h: + region data[offset:offset+nbytes] -> host buffer +``` + +The data area is raw bytes. Simpler does not interpret offsets inside it, does +not construct tensor descriptors, and does not attach protocol meaning to a +range. The caller's protocol decides which offsets contain inputs, outputs, +headers, or message payloads. + +Bounds are checked against the configured `data_bytes`. A zero-length copy at +`offset == data_bytes` is valid; a non-zero copy past the end is invalid. + +Datacopy does not wait, notify, check protocol phase, update ring metadata, or +publish TensorMap dependencies. Protocols compose the primitives explicitly: + +```text +producer write = datacopy_h2region + notify +consumer read = wait + datacopy_region2h +``` + +Datacopy alone is not a synchronization boundary. Visibility to the other +participant is established by composing datacopy with the signal protocol +described below. + +For direct L2 calls, the runtime can copy through the owner process's mapped +host view. For L3 parent calls, the same Python method is proxied to the chip +child: + +```text +h2region: + parent buffer -> child request payload -> child mapped region + +region2h: + child mapped region -> child reply payload -> parent bytes +``` + +The proxy path is an implementation detail. The Python contract is the same +for L2 and L3. + +## Signal Semantics + +Signal slots provide lightweight phase or sequence synchronization: + +```text +notify(signal_id, value) + publish value to signal[signal_id] + +wait(signal_id, target, timeout_us) + complete when observed signal[signal_id] >= target + otherwise report a timeout (C ABI: -EAGAIN; Python: TimeoutError) +``` + +Signal values are `uint32_t`.
A signal slot is best treated as a phase word +for bounded protocol epochs, not as a long-lived channel sequence counter. +Higher-level protocols that need long-running head, tail, or sequence values +should define their own metadata in the mapped data area. + +Signal values should be monotonic within one protocol epoch. Wrap-around +handling is not part of this primitive. + +`wait` has two modes: + +- `timeout_us == 0`: non-blocking probe. +- `timeout_us > 0`: bounded wait. + +There is no infinite wait mode. + +All signal slots start at zero. Therefore a non-blocking wait for target zero +can succeed immediately. + +`device_signal_ptr` points to the device-visible signal slot array. Device code +may use the documented signal layout directly (the example kernel locates slot +`i` as a `uint32_t` at `device_signal_ptr + i * 64`). For example, a kernel can +poll slot 0, read input data, write output data, and then publish slot 1. + +### Memory Ordering + +`notify` is a release publication point for writes sequenced before it. `wait` +is an acquire observation point for reads sequenced after it. + +For CPU produces / NPU consumes: + +```text +host datacopy_h2region(...) +host notify(signal_id, seq) +device wait or poll signal_id >= seq +device reads data +``` + +If device code observes `signal_id >= seq`, device reads after that +observation must see host writes completed before the matching `notify`. + +For NPU produces / CPU consumes: + +```text +device writes data +device publishes signal_id = seq +host wait(signal_id, seq, timeout_us) +host datacopy_region2h(...) +``` + +If host wait succeeds, host reads after wait must see device writes completed +before the matching device signal publication. + +Device code that accesses signal slots directly must preserve the same +ordering contract: polling a signal that reaches the target is an acquire +operation, and publishing a signal after data writes is a release operation.
+ +## Relationship To Task Tensor Payloads + +Simpler's normal task tensor payload path is built around `TaskArgs` and +`ContinuousTensor`. It is task-scoped and tensor-oriented: + +1. The user adds a `ContinuousTensor` to `TaskArgs`. +2. The task is dispatched to a chip child. +3. The chip runtime prepares device-side task arguments. +4. For ordinary tensors, the runtime allocates device memory and copies from + the host pointer in `ContinuousTensor.data`. +5. The runtime replaces the tensor's data pointer with the device pointer + before launching device orchestration and kernels. +6. During validation or copy-back, recorded tensor pairs can be copied from + device memory back to the original host pointer. + +That path is convenient for normal kernel inputs and outputs. The runtime owns +the per-task tensor staging details, and the user describes tensors rather than +explicit data movement phases. + +`child_memory=True` is an opt-out from that automatic staging path. When a +`ContinuousTensor` is marked as child memory, the chip runtime treats +`ContinuousTensor.data` as an existing child-managed device pointer. It passes +the tensor through without allocating new device memory and without staging the +contents again. The caller is responsible for allocating and populating the +device buffer, commonly through `orch.malloc` plus `orch.copy_to`. + +`HostDeviceMappedRegion` is different from both: + +- Ordinary task tensor: `ContinuousTensor` host pointer, implicit staging and + optional copy-back around a task, task/runtime-managed lifetime, TensorMap + dependencies for synchronization. +- `child_memory=True` tensor: existing device pointer, caller-managed copies, + caller/child-managed lifetime, TensorMap can still see the tensor argument. +- Mapped region: data offsets plus signal slots, explicit datacopy, explicit + open/close on a chip-owned region, explicit notify/wait. 
+ +### Difference From `copy_to_device()` + +`copy_to_device()` copies from a host buffer into ordinary device memory. It is +used by the task runtime to stage tensor payloads before execution, and it is +also exposed through worker/orchestrator copy helpers for manually managed +device buffers. + +Mapped-region datacopy targets the mapped region's data area, not an arbitrary +device allocation. It is paired with `mapped_region_info()`, which exposes +device-side views of the region, and with signal slots that let a protocol +publish readiness or completion. + +In short: + +```text +copy_to_device: + host buffer -> device allocation + +mapped_region_datacopy_h2region: + host buffer -> chip-owned mapped-region data area +``` + +The mapped-region path is not a replacement for tensor staging. It is the +primitive to use when host and device need a persistent data area plus explicit +synchronization semantics. + +### Difference From `child_memory=True` + +`child_memory=True` changes how the task runtime interprets a tensor argument. +It says: this `ContinuousTensor.data` value is already a valid child-side +device pointer, so the runtime should not allocate, copy, or free it as an +ordinary task tensor. + +Mapped regions can provide such a pointer, but they do not by themselves make +a tensor. A caller may wrap `info.device_data_ptr` in a +`ContinuousTensor(..., child_memory=True)` when a kernel expects tensor +metadata. The mapped region still owns the backing allocation and signal +slots; `child_memory=True` only prevents the task runtime from trying to stage +that pointer again. + +This composition is useful for a kernel-facing data path: + +```text +host: + region = open_mapped_region(...) 
+ info = mapped_region_info(region) + mapped_region_datacopy_h2region(region, 0, input_bytes) + mapped_region_notify(region, 0, 1) + +task args: + tensor = ContinuousTensor.make( + info.device_data_ptr, + shape, + dtype, + child_memory=True, + ) + args.add_tensor(tensor, TensorArgType.NO_DEP) + args.add_scalar(info.device_signal_ptr) + +device: + wait or poll signal[0] + read or write data through device_data_ptr + publish signal[1] + +host: + mapped_region_wait(region, 1, 1, timeout_us) + output = mapped_region_datacopy_region2h(region, output_offset, nbytes) +``` + +The important boundary is that `child_memory=True` is a task-argument staging +flag, while `HostDeviceMappedRegion` is an allocation, address exposure, +datacopy, and signal primitive. + +### Choosing A Data Path + +Use ordinary tensors for standard task input/output payloads. Use +`child_memory=True` for manually allocated device buffers that should be passed +as tensors without automatic staging. Use `HostDeviceMappedRegion` when a +protocol needs a CPU/NPU-visible data area, persistent lifetime, explicit +byte-level datacopy, and signal slots. + +These choices can be combined. A mapped region can provide the backing device +address for a `child_memory=True` tensor, while the mapped-region signal slots +provide the protocol ordering. + +Mapped-region datacopy and signal operations do not publish TensorMap +dependencies and do not replace `TensorArgType` dependency tags. If a +mapped-region-backed tensor is submitted through `TaskArgs`, choose the tensor +tag deliberately. `NO_DEP` is usually the right tag when synchronization is +handled by the mapped-region signal protocol. + +## L2 Example + +This is the direct one-chip shape. 
It opens one region, reuses it across +iterations, and passes the device-visible addresses to a chip callable: + +```python +worker = Worker( + level=2, + platform="a2a3sim", + runtime="tensormap_and_ringbuffer", + device_id=0, +) +worker.init() + +region = worker.open_mapped_region(data_bytes * 2, signal_count=2) +info = worker.mapped_region_info(region) + +for seq in range(1, 11): + worker.mapped_region_datacopy_h2region(region, 0, make_payload(seq)) + worker.mapped_region_notify(region, 0, seq) + + args = TaskArgs() + args.add_scalar(info.device_data_ptr) + args.add_scalar(info.device_signal_ptr) + args.add_scalar(seq) + args.add_scalar(data_bytes) + worker.run(chip_cid, args, cfg) + + worker.mapped_region_wait(region, 1, seq, 1_000_000) + out = worker.mapped_region_datacopy_region2h( + region, + data_bytes, + data_bytes, + ) + +worker.close_mapped_region(region) +worker.close() +``` + +## L3 Example + +In L3, the parent `Worker` may have multiple chip children. `worker_id` +selects the chip child that owns the mapped region: + +```python +worker = Worker( + level=3, + device_ids=[0, 1], + platform="a2a3sim", + runtime="tensormap_and_ringbuffer", +) +worker.init() + +region0 = worker.open_mapped_region( + data_bytes, + signal_count=2, + worker_id=0, +) +info0 = worker.mapped_region_info(region0) + +region1 = worker.open_mapped_region( + data_bytes, + signal_count=2, + worker_id=1, +) +info1 = worker.mapped_region_info(region1) +``` + +Each region belongs to exactly one chip child. Do not pass a region opened for +`worker_id=0` to operations for `worker_id=1`, and do not pass its device +pointers to a task running on a different chip unless a higher-level protocol +explicitly supports that. + +## Platform Support + +Mapped regions are available on: + +- `a2a3sim` +- `a5sim` +- `a2a3` onboard + +`a5` onboard currently reports mapped regions as unsupported. 
+ +The portable contract is the public Python behavior described here: raw byte +datacopy, explicit signal notify/wait, masked host pointers, opaque handles, +and device-visible addresses suitable for task arguments. + +## Example Location + +The round-trip example lives at: + +```text +examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/ +``` + +Run it on simulation with: + +```bash +cd examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip +python main.py -p a2a3sim -d 0 +``` + +Run it on a2a3 hardware with: + +```bash +cd examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip +python main.py -p a2a3 -d 0 +``` diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 2539c8464..642ae404d 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -38,7 +38,7 @@ Python process (ChipWorker) | +-- rtRegisterAllKernel(aicore_binary) ← CANN kernel registration | +-- rtAicpuKernelLaunchExWithArgs(...) ← device-side execution | - +-- dlopen("libascend_hal.so", RTLD_NOW | RTLD_LOCAL) ← CANN HAL (profiling only) + +-- dlopen("libascend_hal.so", RTLD_NOW | RTLD_LOCAL) ← CANN HAL ``` Key difference: onboard does **not** dlopen AICPU/AICore as host-side SOs. @@ -93,9 +93,11 @@ execution. ### CANN HAL: `RTLD_NOW | RTLD_LOCAL` -`libascend_hal.so` is loaded only for performance profiling (SVM memory -mapping via `halHostRegister`/`halHostUnregister`). The handle is cached -in a file-scope `g_hal_handle` and never explicitly dlclosed. +`libascend_hal.so` is loaded for onboard HAL services that need SVM memory +mapping, including performance profiling buffers and a2a3 +`HostDeviceMappedRegion` host mappings via +`halHostRegister`/`halHostUnregister`. The handle is cached in a file-scope +`g_hal_handle` and never explicitly dlclosed. 
## All dlsym(RTLD_DEFAULT) Calls diff --git a/docs/worker-manager.md b/docs/worker-manager.md index 0be0480ea..87e70fa55 100644 --- a/docs/worker-manager.md +++ b/docs/worker-manager.md @@ -183,7 +183,56 @@ mailbox_size_ = HEADER_SIZE // 8 B (state + error) Per-worker total: ~2 KB. Typical pool: 4-8 workers → ~8-16 KB shm total. -### 3.4 Shutdown +### 3.4 Control-plane commands + +The mailbox is also the per-child control channel. When the parent writes +`CONTROL_REQUEST`, offset 8 carries a `CTRL_*` sub-command instead of a task +callable id. The child loop handles the command in the same polling state +machine as `TASK_READY`, writes `MAILBOX_OFF_ERROR` / `MAILBOX_OFF_ERROR_MSG` +and any scalar result, then publishes `CONTROL_DONE`. + +Task dispatch and control commands share one mailbox. Parent-side +`dispatch_process()` and every `control_*()` method serialize on the same +`mailbox_mu_`, so a control request issued while a task is running waits for +that task's mailbox round trip to finish before it claims the state field. +This is a WorkerManager-level RPC contract; individual features only define +their own `CTRL_*` sub-command and payload schema. + +The fixed control slot layout is: + +```text +offset 8: uint64 control sub-command +offset 16: uint64 arg0 +offset 24: uint64 arg1 +offset 32: uint64 arg2 +offset 40: uint64 arg3 +offset 48: uint64 result +``` + +The meaning of `arg0..arg3` is sub-command-specific. Commands that return one +scalar or pointer write it at `CTRL_OFF_RESULT`; commands with larger request +or reply payloads pass fixed-width POSIX shared-memory names through +`MAILBOX_OFF_ARGS`. + +Current control-plane users include: + +- Device memory control from the orchestrator: + `CTRL_MALLOC`, `CTRL_FREE`, `CTRL_COPY_TO`, and `CTRL_COPY_FROM`. +- Callable lifecycle control: + `CTRL_PREPARE`, `CTRL_REGISTER`, `CTRL_UNREGISTER`, + `CTRL_PY_REGISTER`, and `CTRL_PY_UNREGISTER`. 
+- Communication-domain setup: + `CTRL_COMM_INIT`, `CTRL_ALLOC_DOMAIN`, and `CTRL_RELEASE_DOMAIN`. +- Host/device mapped-region operations: + `CTRL_OPEN_MAPPED_REGION`, `CTRL_CLOSE_MAPPED_REGION`, + `CTRL_MAPPED_REGION_INFO`, datacopy, notify, and wait commands. + +When adding a new control command, keep the mailbox fields limited to small +fixed arguments and move variable-sized payloads into side-band shared memory. +The child must always publish `CONTROL_DONE` with a clear error code/message +before the parent releases the mailbox back to `IDLE`. + +### 3.5 Shutdown `WorkerManager::shutdown_children()` writes `SHUTDOWN` to every registered mailbox; each child loop sees it on its next poll and exits. The Python diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp new file mode 100644 index 000000000..88229c8a2 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include "pipe_sync.h" + +static constexpr uint64_t kCacheLineBytes = 64; +static constexpr uint32_t kMaxPollIters = 1024U; + +static inline __aicore__ void flush_range(volatile __gm__ void *addr, uint64_t size_bytes) { +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t start = reinterpret_cast(addr) & ~(uintptr_t(kCacheLineBytes) - 1u); + uintptr_t end = + (reinterpret_cast(addr) + size_bytes + kCacheLineBytes - 1u) & ~(uintptr_t(kCacheLineBytes) - 1u); + for (uintptr_t p = start; p < end; p += kCacheLineBytes) { + dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT); + } +#if defined(__CPU_SIM) + dsb(0); +#else + dsb(DSB_DDR); +#endif + pipe_barrier(PIPE_ALL); +#else + (void)addr; + (void)size_bytes; + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static inline __aicore__ void invalidate_range(volatile __gm__ void *addr, uint64_t size_bytes) { +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t start = reinterpret_cast(addr) & ~(uintptr_t(kCacheLineBytes) - 1u); + uintptr_t end = + (reinterpret_cast(addr) + size_bytes + kCacheLineBytes - 1u) & ~(uintptr_t(kCacheLineBytes) - 1u); + for (uintptr_t p = start; p < end; p += kCacheLineBytes) { + dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE); + } +#if defined(__CPU_SIM) + dsb(0); +#else + dsb(DSB_DDR); +#endif +#else + (void)addr; + (void)size_bytes; + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static inline __aicore__ volatile __gm__ uint32_t *signal_slot(__gm__ uint8_t *signal_base, uint32_t signal_id) { + return reinterpret_cast(signal_base + signal_id * kCacheLineBytes); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { 
+ auto *data = reinterpret_cast<__gm__ uint8_t *>(static_cast(args[0])); + auto *signal_base = reinterpret_cast<__gm__ uint8_t *>(static_cast(args[1])); + auto *signal0 = signal_slot(signal_base, 0); + auto *signal1 = signal_slot(signal_base, 1); + uint32_t seq = static_cast(args[2]); + uint32_t nbytes = static_cast(args[3]); + + bool observed = false; + for (uint32_t i = 0; i < kMaxPollIters; ++i) { + invalidate_range(signal0, kCacheLineBytes); + if (*signal0 >= seq) { + observed = true; + break; + } + } + + invalidate_range(data, nbytes); + for (uint32_t i = 0; i < nbytes; ++i) { + uint8_t mask = observed ? static_cast(seq + i * 3U) : static_cast(0xA5U); + data[nbytes + i] = static_cast(data[i] ^ mask); + } + flush_range(data + nbytes, nbytes); + + *signal1 = seq; + flush_range(signal1, kCacheLineBytes); +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp new file mode 100644 index 000000000..84f230e3c --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +host_device_mapped_region_round_trip_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 4}; +} + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + return host_device_mapped_region_round_trip_config(orch_args); +} + +__attribute__((visibility("default"))) void host_device_mapped_region_round_trip_orch(const ChipStorageTaskArgs &orch_args) { + Arg args; + args.add_scalar(orch_args.scalar(0)); + args.add_scalar(orch_args.scalar(1)); + args.add_scalar(orch_args.scalar(2)); + args.add_scalar(orch_args.scalar(3)); + rt_submit_aiv_task(0, args); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py new file mode 100644 index 000000000..516e0f567 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Host CPU to device NPU round-trip through HostDeviceMappedRegion.""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from simpler.task_interface import ArgDirection, CallConfig, ChipCallable, CoreCallable, TaskArgs +from simpler.worker import Worker +from simpler_setup.elf_parser import extract_text_section +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.runtime_builder import RuntimeBuilder + + +HERE = Path(__file__).resolve().parent +KERNEL_DIR = HERE / "kernels" +RUNTIME = "tensormap_and_ringbuffer" +DEFAULT_DATA_BYTES = 256 +DEFAULT_ITERS = 10 + + +def _build_callable(platform: str) -> ChipCallable: + kc = KernelCompiler(platform=platform) + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + include_dirs = kc.get_orchestration_include_dirs(RUNTIME) + + incore = kc.compile_incore( + source_path=str(KERNEL_DIR / "aiv" / "host_device_mapped_region_round_trip.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=include_dirs, + ) + if not platform.endswith("sim"): + incore = extract_text_section(incore) + + orch = kc.compile_orchestration( + runtime_name=RUNTIME, + source_path=str(KERNEL_DIR / "orchestration" / "host_device_mapped_region_round_trip_orch.cpp"), + ) + return ChipCallable.build( + signature=[ArgDirection.IN, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + func_name="host_device_mapped_region_round_trip_orch", + binary=orch, + children=[(0, CoreCallable.build(signature=[], binary=incore))], + ) + + +def _pattern(seq: int, data_bytes: int) -> bytes: + return bytes(((seq * 17 + i * 5) & 0xFF) for i in range(data_bytes)) + + +def _expected(seq: int, payload: bytes) -> bytes: + return bytes((b ^ ((seq + i * 3) & 0xFF)) for i, b in enumerate(payload)) + + +def 
run( + platform: str, + device_id: int, + *, + build: bool = False, + iters: int = DEFAULT_ITERS, + data_bytes: int = DEFAULT_DATA_BYTES, +) -> None: + if platform not in {"a2a3sim", "a2a3"}: + raise ValueError(f"unsupported platform: {platform}") + if iters <= 0: + raise ValueError("iters must be positive") + if data_bytes <= 0: + raise ValueError("data_bytes must be positive") + + os.environ["PTO_ISA_ROOT"] = ensure_pto_isa_root(clone_protocol="https") + RuntimeBuilder(platform=platform).get_binaries(RUNTIME, build=build) + chip_callable = _build_callable(platform) + + worker = Worker(level=2, platform=platform, runtime=RUNTIME, device_id=device_id, build=build) + worker.init() + region = None + try: + chip_cid = worker.register(chip_callable) + region = worker.open_mapped_region(data_bytes * 2, signal_count=2) + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + + cfg = CallConfig() + cfg.block_dim = 1 + cfg.aicpu_thread_num = 2 + + for seq in range(1, iters + 1): + payload = _pattern(seq, data_bytes) + worker.mapped_region_datacopy_h2region(region, 0, payload) + worker.mapped_region_notify(region, 0, seq) + + args = TaskArgs() + args.add_scalar(info.device_data_ptr) + args.add_scalar(info.device_signal_ptr) + args.add_scalar(seq) + args.add_scalar(data_bytes) + worker.run(chip_cid, args, cfg) + + worker.mapped_region_wait(region, 1, seq, 1_000_000) + got = worker.mapped_region_datacopy_region2h(region, data_bytes, data_bytes) + assert got == _expected(seq, payload) + finally: + if region is not None: + worker.close_mapped_region(region) + worker.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3"]) + parser.add_argument("-d", "--device", type=int, default=0) + parser.add_argument("--build", 
action="store_true", help="Rebuild runtime from source.") + parser.add_argument("--iters", type=int, default=DEFAULT_ITERS) + parser.add_argument("--data-bytes", type=int, default=DEFAULT_DATA_BYTES) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + run(args.platform, args.device, build=args.build, iters=args.iters, data_bytes=args.data_bytes) + print( + "[host_device_mapped_region_round_trip] " + f"platform={args.platform} device={args.device} iters={args.iters} data_bytes={args.data_bytes} PASSED" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py new file mode 100644 index 000000000..ec449b40c --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py @@ -0,0 +1,67 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Pytest entrypoint for the HostDeviceMappedRegion round-trip example. + +This example demonstrates host CPU to device NPU communication through a +``HostDeviceMappedRegion``. 
The host opens one mapped region, reuses it for 10 +iterations, writes a sequence-dependent input pattern, notifies signal slot 0, +submits an AIV task with the returned device pointers, waits for signal slot 1, +then reads and checks the output bytes. + +Run directly: + + python examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py -p a2a3sim -d 0 + python examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py -p a2a3 -d 0 + +Use ``--build`` to rebuild the runtime from source. Use ``--iters N`` to adjust +the number of reused-region iterations; support gating should keep the default +10 iterations. +""" + +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + + +HERE = Path(__file__).resolve().parent +REPO_ROOT = HERE.parents[3] + + +def _subprocess_env() -> dict[str, str]: + env = os.environ.copy() + paths = [str(REPO_ROOT), str(REPO_ROOT / "python")] + venv_lib = REPO_ROOT / ".venv" / "lib" + if venv_lib.exists(): + paths.extend(str(p) for p in sorted(venv_lib.glob("python*/site-packages"))) + existing = env.get("PYTHONPATH") + if existing: + paths.append(existing) + env["PYTHONPATH"] = os.pathsep.join(paths) + return env + + +@pytest.mark.platforms(["a2a3sim", "a2a3"]) +def test_host_device_mapped_region_round_trip(request): + platform = request.config.getoption("--platform", default=None) or "a2a3sim" + device = request.config.getoption("--device", default=None) + device_id = int(str(device).split(",")[0].split("-")[0]) if device is not None else 0 + result = subprocess.run( + [sys.executable, str(HERE / "main.py"), "-p", platform, "-d", str(device_id)], + text=True, + capture_output=True, + timeout=180, + check=False, + env=_subprocess_env(), + ) + assert result.returncode == 0, result.stdout + result.stderr diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 4ba073839..0a39065e6 100644 --- 
a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -18,10 +18,13 @@ */ #include +#include #include #include #include +#include + #include #include #include @@ -40,6 +43,50 @@ namespace nb = nanobind; +namespace { + +struct MappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint64_t total_bytes; + uint32_t flags; + + MappedRegionInfo( + uint64_t host_data_ptr_, uint64_t device_data_ptr_, uint64_t data_bytes_, uint64_t host_signal_ptr_, + uint64_t device_signal_ptr_, uint32_t signal_count_, uint64_t total_bytes_, uint32_t flags_ + ) : + host_data_ptr(host_data_ptr_), + device_data_ptr(device_data_ptr_), + data_bytes(data_bytes_), + host_signal_ptr(host_signal_ptr_), + device_signal_ptr(device_signal_ptr_), + signal_count(signal_count_), + total_bytes(total_bytes_), + flags(flags_) {} +}; + +MappedRegionInfo make_mapped_region_info(const HostDeviceMappedRegionInfo &info) { + return MappedRegionInfo{ + info.host_data_ptr, info.device_data_ptr, info.data_bytes, info.host_signal_ptr, + info.device_signal_ptr, info.signal_count, info.total_bytes, info.flags, + }; +} + +void raise_python_exception_for_mapped_region_error(const std::exception &e) { + std::string msg = e.what(); + if (msg.find("timed out") != std::string::npos) { + PyErr_SetString(PyExc_TimeoutError, msg.c_str()); + throw nb::python_error(); + } + throw; +} + +} // namespace + // ============================================================================ // Module definition // ============================================================================ @@ -703,6 +750,29 @@ NB_MODULE(_task_interface, m) { return os.str(); }); + nb::class_(m, "MappedRegionInfo") + .def( + nb::init(), + nb::arg("host_data_ptr"), nb::arg("device_data_ptr"), nb::arg("data_bytes"), nb::arg("host_signal_ptr"), + nb::arg("device_signal_ptr"), nb::arg("signal_count"), 
nb::arg("total_bytes"), nb::arg("flags") + ) + .def_ro("host_data_ptr", &MappedRegionInfo::host_data_ptr) + .def_ro("device_data_ptr", &MappedRegionInfo::device_data_ptr) + .def_ro("data_bytes", &MappedRegionInfo::data_bytes) + .def_ro("host_signal_ptr", &MappedRegionInfo::host_signal_ptr) + .def_ro("device_signal_ptr", &MappedRegionInfo::device_signal_ptr) + .def_ro("signal_count", &MappedRegionInfo::signal_count) + .def_ro("total_bytes", &MappedRegionInfo::total_bytes) + .def_ro("flags", &MappedRegionInfo::flags) + .def("__repr__", [](const MappedRegionInfo &info) { + std::ostringstream os; + os << "MappedRegionInfo(device_data_ptr=0x" << std::hex << info.device_data_ptr << ", device_signal_ptr=0x" + << info.device_signal_ptr << std::dec << ", data_bytes=" << info.data_bytes + << ", signal_count=" << info.signal_count << ", total_bytes=" << info.total_bytes + << ", flags=" << info.flags << ")"; + return os.str(); + }); + // --- ChipWorker --- nb::class_(m, "_ChipWorker") .def(nb::init<>()) @@ -800,6 +870,69 @@ NB_MODULE(_task_interface, m) { .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) .def("copy_from", &ChipWorker::copy_from, nb::arg("dst"), nb::arg("src"), nb::arg("size")) + .def( + "open_mapped_region", + [](ChipWorker &self, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + return self.open_mapped_region(data_bytes, signal_count, flags); + }, + nb::arg("data_bytes"), nb::arg("signal_count") = 1, nb::arg("flags") = 0 + ) + .def("close_mapped_region", &ChipWorker::close_mapped_region, nb::arg("handle")) + .def( + "mapped_region_info", + [](ChipWorker &self, uint64_t handle) { + return make_mapped_region_info(self.mapped_region_info(handle)); + }, + nb::arg("handle") + ) + .def( + "mapped_region_datacopy_h2region", + [](ChipWorker &self, uint64_t handle, uint64_t offset, nb::object obj) { + if (PyUnicode_Check(obj.ptr())) { + throw 
std::invalid_argument("mapped_region_datacopy_h2region requires a bytes-like object"); + } + Py_buffer view{}; + if (PyObject_GetBuffer(obj.ptr(), &view, PyBUF_CONTIG_RO) != 0) { + throw nb::python_error(); + } + try { + self.mapped_region_datacopy_h2region(handle, offset, view.buf, static_cast(view.len)); + } catch (const std::exception &e) { + PyBuffer_Release(&view); + raise_python_exception_for_mapped_region_error(e); + } + PyBuffer_Release(&view); + }, + nb::arg("handle"), nb::arg("offset"), nb::arg("buffer") + ) + .def( + "mapped_region_datacopy_region2h", + [](ChipWorker &self, uint64_t handle, uint64_t offset, size_t nbytes) { + std::string out(nbytes, '\0'); + try { + self.mapped_region_datacopy_region2h(handle, offset, out.data(), nbytes); + } catch (const std::exception &e) { + raise_python_exception_for_mapped_region_error(e); + } + return nb::bytes(out.data(), out.size()); + }, + nb::arg("handle"), nb::arg("offset"), nb::arg("nbytes") + ) + .def( + "mapped_region_notify", &ChipWorker::mapped_region_notify, nb::arg("handle"), nb::arg("signal_id"), + nb::arg("value") + ) + .def( + "mapped_region_wait", + [](ChipWorker &self, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us) { + try { + self.mapped_region_wait(handle, signal_id, target, timeout_us); + } catch (const std::exception &e) { + raise_python_exception_for_mapped_region_error(e); + } + }, + nb::arg("handle"), nb::arg("signal_id"), nb::arg("target"), nb::arg("timeout_us") + ) .def( "comm_init", &ChipWorker::comm_init, nb::arg("rank"), nb::arg("nranks"), nb::arg("rootinfo_path"), "Initialize a communicator for this rank. 
ChipWorker owns ACL + stream " diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h index 19c6598dd..bafb9ae14 100644 --- a/python/bindings/worker_bind.h +++ b/python/bindings/worker_bind.h @@ -296,12 +296,49 @@ inline void bind_worker(nb::module_ &m) { "control_comm_init", &Worker::control_comm_init, nb::arg("worker_id"), nb::arg("request_shm_name"), nb::call_guard(), "Drive one NEXT_LEVEL chip child through CTRL_COMM_INIT (lazy base comm init)." + ) + .def( + "control_open_mapped_region", &Worker::control_open_mapped_region, nb::arg("worker_id"), + nb::arg("data_bytes"), nb::arg("signal_count"), nb::arg("flags"), nb::call_guard(), + "Open a HostDeviceMappedRegion on a NEXT_LEVEL chip child." + ) + .def( + "control_close_mapped_region", &Worker::control_close_mapped_region, nb::arg("worker_id"), + nb::arg("handle"), nb::call_guard(), + "Close a HostDeviceMappedRegion on a NEXT_LEVEL chip child." + ) + .def( + "control_mapped_region_payload", &Worker::control_mapped_region_payload, nb::arg("worker_id"), + nb::arg("sub_cmd"), nb::arg("shm_name"), nb::call_guard(), + "Dispatch a HostDeviceMappedRegion side-band shm payload command." + ) + .def( + "control_mapped_region_notify", &Worker::control_mapped_region_notify, nb::arg("worker_id"), + nb::arg("handle"), nb::arg("signal_id"), nb::arg("value"), nb::call_guard(), + "Notify a HostDeviceMappedRegion signal slot on a NEXT_LEVEL chip child." + ) + .def( + "control_mapped_region_wait", &Worker::control_mapped_region_wait, nb::arg("worker_id"), nb::arg("handle"), + nb::arg("signal_id"), nb::arg("target"), nb::arg("timeout_us"), nb::call_guard(), + "Wait on a HostDeviceMappedRegion signal slot on a NEXT_LEVEL chip child." 
); m.attr("DEFAULT_HEAP_RING_SIZE") = static_cast(DEFAULT_HEAP_RING_SIZE); m.attr("MAILBOX_SIZE") = static_cast(MAILBOX_SIZE); m.attr("MAILBOX_OFF_ERROR_MSG") = static_cast(MAILBOX_OFF_ERROR_MSG); m.attr("MAILBOX_ERROR_MSG_SIZE") = static_cast(MAILBOX_ERROR_MSG_SIZE); + m.attr("CTRL_OFF_ARG0") = static_cast(CTRL_OFF_ARG0); + m.attr("CTRL_OFF_ARG1") = static_cast(CTRL_OFF_ARG1); + m.attr("CTRL_OFF_ARG2") = static_cast(CTRL_OFF_ARG2); + m.attr("CTRL_OFF_ARG3") = static_cast(CTRL_OFF_ARG3); + m.attr("CTRL_OFF_RESULT") = static_cast(CTRL_OFF_RESULT); + m.attr("CTRL_OPEN_MAPPED_REGION") = static_cast(CTRL_OPEN_MAPPED_REGION); + m.attr("CTRL_CLOSE_MAPPED_REGION") = static_cast(CTRL_CLOSE_MAPPED_REGION); + m.attr("CTRL_MAPPED_REGION_INFO") = static_cast(CTRL_MAPPED_REGION_INFO); + m.attr("CTRL_MAPPED_REGION_DATACOPY_H2REGION") = static_cast(CTRL_MAPPED_REGION_DATACOPY_H2REGION); + m.attr("CTRL_MAPPED_REGION_DATACOPY_REGION2H") = static_cast(CTRL_MAPPED_REGION_DATACOPY_REGION2H); + m.attr("CTRL_MAPPED_REGION_NOTIFY") = static_cast(CTRL_MAPPED_REGION_NOTIFY); + m.attr("CTRL_MAPPED_REGION_WAIT") = static_cast(CTRL_MAPPED_REGION_WAIT); m.attr("MAX_RING_DEPTH") = static_cast(MAX_RING_DEPTH); m.attr("MAX_SCOPE_DEPTH") = static_cast(MAX_SCOPE_DEPTH); diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 0a06d269a..5832840c9 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -35,6 +35,7 @@ ContinuousTensor, CoreCallable, DataType, + MappedRegionInfo, SubmitResult, TaskArgs, TaskState, @@ -63,6 +64,7 @@ "ChipCallable", "CallConfig", "ChipWorker", + "MappedRegionInfo", "arg_direction_name", "scalar_to_uint64", # Distributed runtime @@ -428,6 +430,34 @@ def copy_from(self, dst, src, size): """Copy *size* bytes from worker *src* to host *dst*.""" self._impl.copy_from(int(dst), int(src), int(size)) + def open_mapped_region(self, data_bytes: int, signal_count: int = 1, flags: int = 0) -> int: + """Open a child-owned 
mapped region and return its opaque handle.""" + return int(self._impl.open_mapped_region(int(data_bytes), int(signal_count), int(flags))) + + def close_mapped_region(self, handle: int) -> None: + """Close a mapped-region handle opened on this chip worker.""" + self._impl.close_mapped_region(int(handle)) + + def mapped_region_info(self, handle: int) -> MappedRegionInfo: + """Return public mapped-region info with host pointers masked to zero.""" + return self._impl.mapped_region_info(int(handle)) + + def mapped_region_datacopy_h2region(self, handle: int, offset: int, data) -> None: + """Copy bytes-like data into the mapped region's data area.""" + self._impl.mapped_region_datacopy_h2region(int(handle), int(offset), data) + + def mapped_region_datacopy_region2h(self, handle: int, offset: int, nbytes: int) -> bytes: + """Copy bytes out of the mapped region's data area.""" + return self._impl.mapped_region_datacopy_region2h(int(handle), int(offset), int(nbytes)) + + def mapped_region_notify(self, handle: int, signal_id: int, value: int) -> None: + """Publish a mapped-region signal slot value.""" + self._impl.mapped_region_notify(int(handle), int(signal_id), int(value)) + + def mapped_region_wait(self, handle: int, signal_id: int, target: int, timeout_us: int) -> None: + """Wait until a mapped-region signal slot reaches ``target``.""" + self._impl.mapped_region_wait(int(handle), int(signal_id), int(target), int(timeout_us)) + def comm_init(self, rank: int, nranks: int, rootinfo_path: str) -> int: """Initialize a distributed communicator for this rank. 
diff --git a/python/simpler/worker.py b/python/simpler/worker.py index e4956e708..4ff528fb3 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -56,17 +56,31 @@ def my_l4_orch(orch, args, config): """ import ctypes +import errno import os import signal import struct import sys import threading import time +from dataclasses import dataclass from multiprocessing.shared_memory import SharedMemory from typing import Any, Optional import cloudpickle from _task_interface import ( # pyright: ignore[reportMissingImports] + CTRL_CLOSE_MAPPED_REGION as _CPP_CTRL_CLOSE_MAPPED_REGION, + CTRL_MAPPED_REGION_DATACOPY_H2REGION as _CPP_CTRL_MAPPED_REGION_DATACOPY_H2REGION, + CTRL_MAPPED_REGION_DATACOPY_REGION2H as _CPP_CTRL_MAPPED_REGION_DATACOPY_REGION2H, + CTRL_MAPPED_REGION_INFO as _CPP_CTRL_MAPPED_REGION_INFO, + CTRL_MAPPED_REGION_NOTIFY as _CPP_CTRL_MAPPED_REGION_NOTIFY, + CTRL_MAPPED_REGION_WAIT as _CPP_CTRL_MAPPED_REGION_WAIT, + CTRL_OFF_ARG0 as _CPP_CTRL_OFF_ARG0, + CTRL_OFF_ARG1 as _CPP_CTRL_OFF_ARG1, + CTRL_OFF_ARG2 as _CPP_CTRL_OFF_ARG2, + CTRL_OFF_ARG3 as _CPP_CTRL_OFF_ARG3, + CTRL_OFF_RESULT as _CPP_CTRL_OFF_RESULT, + CTRL_OPEN_MAPPED_REGION as _CPP_CTRL_OPEN_MAPPED_REGION, MAX_REGISTERED_CALLABLE_IDS, RunTiming, WorkerType, @@ -87,6 +101,7 @@ def my_l4_orch(orch, args, config): ChipWorker, CommBufferSpec, CommDomainHandle, + MappedRegionInfo, TaskArgs, _Worker, ) @@ -169,6 +184,13 @@ def my_l4_orch(orch, args, config): _CTRL_COMM_INIT = 9 _CTRL_PY_REGISTER = 10 _CTRL_PY_UNREGISTER = 11 +_CTRL_OPEN_MAPPED_REGION = int(_CPP_CTRL_OPEN_MAPPED_REGION) +_CTRL_CLOSE_MAPPED_REGION = int(_CPP_CTRL_CLOSE_MAPPED_REGION) +_CTRL_MAPPED_REGION_INFO = int(_CPP_CTRL_MAPPED_REGION_INFO) +_CTRL_MAPPED_REGION_DATACOPY_H2REGION = int(_CPP_CTRL_MAPPED_REGION_DATACOPY_H2REGION) +_CTRL_MAPPED_REGION_DATACOPY_REGION2H = int(_CPP_CTRL_MAPPED_REGION_DATACOPY_REGION2H) +_CTRL_MAPPED_REGION_NOTIFY = int(_CPP_CTRL_MAPPED_REGION_NOTIFY) +_CTRL_MAPPED_REGION_WAIT = 
int(_CPP_CTRL_MAPPED_REGION_WAIT) # Layout of the CTRL_COMM_INIT request shm. _COMM_INIT_HEADER = struct.Struct(" bytes: @@ -494,6 +529,79 @@ def _handle_ctrl_release_domain(cw: "ChipWorker", buf: memoryview) -> None: cw._impl.comm_release_domain_windows(int(handle), int(allocation_id), int(rank_count), int(domain_rank)) +def _mapped_region_exception_status(exc: BaseException) -> int: + if isinstance(exc, (ValueError, TypeError)): + return -errno.EINVAL + if isinstance(exc, TimeoutError): + return -errno.EAGAIN + msg = str(exc) + if "-95" in msg or "ENOTSUP" in msg: + return -errno.ENOTSUP + return -errno.EIO + + +def _handle_ctrl_mapped_region_payload(cw: "ChipWorker", buf: memoryview, sub_cmd: int) -> None: + shm_name = _read_shm_name(buf, _OFF_ARGS) + expected_op = { + _CTRL_MAPPED_REGION_INFO: _HDMR_OP_INFO, + _CTRL_MAPPED_REGION_DATACOPY_H2REGION: _HDMR_OP_H2REGION, + _CTRL_MAPPED_REGION_DATACOPY_REGION2H: _HDMR_OP_REGION2H, + }.get(int(sub_cmd)) + if expected_op is None: + raise RuntimeError(f"mapped-region payload: invalid sub-command {int(sub_cmd)}") + + shm = SharedMemory(name=shm_name) + try: + shm_buf = shm.buf + assert shm_buf is not None + if shm.size < _HDMR_HEADER.size: + raise RuntimeError(f"mapped-region payload too small: {shm.size} bytes") + magic, version, op, region, offset, nbytes, _status, reserved = _HDMR_HEADER.unpack_from(shm_buf, 0) + if magic != _HDMR_MAGIC: + raise RuntimeError(f"mapped-region payload invalid magic: {magic!r}") + if version != _HDMR_VERSION: + raise RuntimeError(f"mapped-region payload unsupported version: {version}") + if op != expected_op: + raise RuntimeError(f"mapped-region payload op {op} does not match sub-command {int(sub_cmd)}") + if reserved != 0: + raise RuntimeError(f"mapped-region payload reserved field must be zero, got {reserved}") + required_size = _HDMR_HEADER.size + (0 if op == _HDMR_OP_INFO else int(nbytes)) + if op == _HDMR_OP_INFO: + required_size = _HDMR_HEADER.size + _HDMR_INFO_PAYLOAD.size + if 
required_size > shm.size: + raise RuntimeError(f"mapped-region payload size mismatch: need {required_size}, shm={shm.size}") + + status = 0 + try: + if op == _HDMR_OP_INFO: + info = cw.mapped_region_info(int(region)) + _HDMR_INFO_PAYLOAD.pack_into( + shm_buf, + _HDMR_HEADER.size, + 0, + int(info.device_data_ptr), + int(info.data_bytes), + 0, + int(info.device_signal_ptr), + int(info.signal_count), + int(info.total_bytes), + int(info.flags), + ) + elif op == _HDMR_OP_H2REGION: + payload = bytes(shm_buf[_HDMR_HEADER.size : _HDMR_HEADER.size + int(nbytes)]) + cw.mapped_region_datacopy_h2region(int(region), int(offset), payload) + elif op == _HDMR_OP_REGION2H: + payload = cw.mapped_region_datacopy_region2h(int(region), int(offset), int(nbytes)) + shm_buf[_HDMR_HEADER.size : _HDMR_HEADER.size + int(nbytes)] = payload + else: + status = -errno.EINVAL + except Exception as e: # noqa: BLE001 + status = _mapped_region_exception_status(e) + struct.pack_into(" int: """Return the cached base-communicator handle the chip allocated during bootstrap. 
@@ -653,6 +761,32 @@ def _run_chip_main_loop( # noqa: PLR0912 -- TASK_READY + 6 control sub-commands _handle_ctrl_release_domain(cw, buf) elif sub_cmd == _CTRL_COMM_INIT: _handle_ctrl_comm_init(cw, buf) + elif sub_cmd == _CTRL_OPEN_MAPPED_REGION: + data_bytes = struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0] + signal_count = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0] + flags = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0] + handle = cw.open_mapped_region(int(data_bytes), int(signal_count), int(flags)) + struct.pack_into("Q", buf, _CTRL_OFF_RESULT, int(handle)) + elif sub_cmd == _CTRL_CLOSE_MAPPED_REGION: + handle = struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0] + cw.close_mapped_region(int(handle)) + elif sub_cmd in ( + _CTRL_MAPPED_REGION_INFO, + _CTRL_MAPPED_REGION_DATACOPY_H2REGION, + _CTRL_MAPPED_REGION_DATACOPY_REGION2H, + ): + _handle_ctrl_mapped_region_payload(cw, buf, int(sub_cmd)) + elif sub_cmd == _CTRL_MAPPED_REGION_NOTIFY: + handle = struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0] + signal_id = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0] + value = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0] + cw.mapped_region_notify(int(handle), int(signal_id), int(value)) + elif sub_cmd == _CTRL_MAPPED_REGION_WAIT: + handle = struct.unpack_from("Q", buf, _CTRL_OFF_ARG0)[0] + signal_id = struct.unpack_from("Q", buf, _CTRL_OFF_ARG1)[0] + target = struct.unpack_from("Q", buf, _CTRL_OFF_ARG2)[0] + timeout_us = struct.unpack_from("Q", buf, _CTRL_OFF_ARG3)[0] + cw.mapped_region_wait(int(handle), int(signal_id), int(target), int(timeout_us)) else: raise RuntimeError(f"unknown control sub-command {int(sub_cmd)}") except Exception as e: # noqa: BLE001 @@ -831,6 +965,16 @@ def _child_worker_loop( # --------------------------------------------------------------------------- +@dataclass(frozen=True) +class MappedRegion: + handle: int + worker_id: int + data_bytes: int + signal_count: int + flags: int + closed: bool = False + + class Worker: """Unified worker for 
all hierarchy levels. @@ -1264,6 +1408,251 @@ def _init_level2(self) -> None: if isinstance(target, ChipCallable): self._chip_worker.prepare_callable(cid, target) + def _resolve_mapped_region_worker_id(self, region: MappedRegion, worker_id: Optional[int]) -> int: + selected = region.worker_id if worker_id is None else int(worker_id) + if selected != region.worker_id: + raise ValueError(f"mapped region belongs to worker_id={region.worker_id}, got worker_id={selected}") + return selected + + def _ensure_open_mapped_region(self, region: MappedRegion, worker_id: Optional[int]) -> int: + selected = self._resolve_mapped_region_worker_id(region, worker_id) + if region.closed: + raise ValueError("mapped region is closed") + return selected + + def _mapped_region_chip_worker(self, worker_id: int) -> ChipWorker: + if self.level != 2: + raise NotImplementedError("mapped-region L3 proxy support is not implemented yet") + if worker_id != 0: + raise ValueError("level-2 mapped regions only support worker_id=0") + if self._chip_worker is None: + raise RuntimeError("Worker.init() must be called before mapped-region operations") + return self._chip_worker + + def _ensure_mapped_region_l3_control_ready(self, worker_id: int) -> _Worker: + if not self._initialized: + raise RuntimeError("Worker.init() must be called before mapped-region operations") + if self.level != 3: + raise RuntimeError("mapped-region L3 proxy support requires a level-3 Worker with chip children") + self._check_chip_worker_id(worker_id) + if not getattr(self, "_hierarchical_started", False): + self._start_hierarchical() + if self._worker is None: + raise RuntimeError("mapped-region L3 proxy is not available after Worker.close()") + return self._worker + + def _raise_mapped_region_control_error(self, exc: RuntimeError) -> None: + msg = str(exc) + if "TimeoutError" in msg or "timed out" in msg: + raise TimeoutError(msg) from exc + if "ValueError" in msg or "invalid_argument" in msg or "code -22" in msg: + raise 
ValueError(msg) from exc + raise exc + + def _raise_mapped_region_status(self, status: int) -> None: + if status == 0: + return + if status == -errno.EAGAIN or status == -errno.EWOULDBLOCK: + raise TimeoutError(f"mapped-region operation timed out with status {status}") + if status == -errno.EINVAL: + raise ValueError(f"mapped-region operation failed with status {status}") + raise RuntimeError(f"mapped-region operation failed with status {status}") + + def _mapped_region_shm_name(self, worker_id: int) -> str: + counter = getattr(self, "_mapped_region_shm_counter", 0) + self._mapped_region_shm_counter = counter + 1 + name = f"simpler-hdmr-{os.getpid()}-{int(worker_id)}-{counter}" + if len(name.encode("utf-8")) + 1 > _CTRL_SHM_NAME_BYTES: + raise RuntimeError(f"mapped-region shm name too long: {name}") + return name + + def _mapped_region_payload_roundtrip( + self, + worker_id: int, + sub_cmd: int, + op: int, + region_handle: int, + offset: int, + payload: bytes, + reply_nbytes: int, + ) -> tuple[int, bytes]: + dw = self._ensure_mapped_region_l3_control_ready(worker_id) + nbytes = len(payload) if op == _HDMR_OP_H2REGION else int(reply_nbytes) + shm_size = _HDMR_HEADER.size + (_HDMR_INFO_PAYLOAD.size if op == _HDMR_OP_INFO else int(nbytes)) + shm = SharedMemory(name=self._mapped_region_shm_name(worker_id), create=True, size=shm_size) + try: + shm_buf = shm.buf + assert shm_buf is not None + shm_buf[:] = b"\x00" * shm_size + _HDMR_HEADER.pack_into( + shm_buf, + 0, + _HDMR_MAGIC, + _HDMR_VERSION, + int(op), + int(region_handle), + int(offset), + int(nbytes), + 0, + 0, + ) + if payload: + shm_buf[_HDMR_HEADER.size : _HDMR_HEADER.size + len(payload)] = payload + try: + dw.control_mapped_region_payload(int(worker_id), int(sub_cmd), shm.name) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + status = struct.unpack_from(" MappedRegion: + worker_id = int(worker_id) + if self.level == 2: + handle = 
self._mapped_region_chip_worker(worker_id).open_mapped_region( + int(data_bytes), int(signal_count), int(flags) + ) + else: + dw = self._ensure_mapped_region_l3_control_ready(worker_id) + try: + handle = dw.control_open_mapped_region(worker_id, int(data_bytes), int(signal_count), int(flags)) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + return MappedRegion( + handle=int(handle), + worker_id=int(worker_id), + data_bytes=int(data_bytes), + signal_count=int(signal_count), + flags=int(flags), + ) + + def close_mapped_region(self, region: MappedRegion, worker_id: Optional[int] = None) -> None: + selected = self._resolve_mapped_region_worker_id(region, worker_id) + if region.closed: + return + if self.level == 2: + self._mapped_region_chip_worker(selected).close_mapped_region(region.handle) + else: + dw = self._ensure_mapped_region_l3_control_ready(selected) + try: + dw.control_close_mapped_region(selected, int(region.handle)) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + object.__setattr__(region, "closed", True) + + def mapped_region_info(self, region: MappedRegion, worker_id: Optional[int] = None) -> MappedRegionInfo: + selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 2: + return self._mapped_region_chip_worker(selected).mapped_region_info(region.handle) + status, payload = self._mapped_region_payload_roundtrip( + selected, _CTRL_MAPPED_REGION_INFO, _HDMR_OP_INFO, int(region.handle), 0, b"", _HDMR_INFO_PAYLOAD.size + ) + self._raise_mapped_region_status(status) + fields = _HDMR_INFO_PAYLOAD.unpack_from(payload, 0) + return MappedRegionInfo(*fields) + + def mapped_region_datacopy_h2region( + self, + region: MappedRegion, + offset: int, + data, + worker_id: Optional[int] = None, + ) -> None: + if isinstance(data, str): + raise ValueError("mapped_region_datacopy_h2region requires a bytes-like object") + selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 
2: + self._mapped_region_chip_worker(selected).mapped_region_datacopy_h2region(region.handle, int(offset), data) + return + try: + payload = memoryview(data) + except TypeError as e: + raise ValueError("mapped_region_datacopy_h2region requires a bytes-like object") from e + if not payload.contiguous: + raise ValueError("mapped_region_datacopy_h2region requires a contiguous bytes-like object") + status, _ = self._mapped_region_payload_roundtrip( + selected, + _CTRL_MAPPED_REGION_DATACOPY_H2REGION, + _HDMR_OP_H2REGION, + int(region.handle), + int(offset), + bytes(payload), + 0, + ) + self._raise_mapped_region_status(status) + + def mapped_region_datacopy_region2h( + self, + region: MappedRegion, + offset: int, + nbytes: int, + worker_id: Optional[int] = None, + ) -> bytes: + selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 2: + return self._mapped_region_chip_worker(selected).mapped_region_datacopy_region2h( + region.handle, int(offset), int(nbytes) + ) + status, payload = self._mapped_region_payload_roundtrip( + selected, + _CTRL_MAPPED_REGION_DATACOPY_REGION2H, + _HDMR_OP_REGION2H, + int(region.handle), + int(offset), + b"", + int(nbytes), + ) + self._raise_mapped_region_status(status) + return payload + + def mapped_region_notify( + self, + region: MappedRegion, + signal_id: int, + value: int, + worker_id: Optional[int] = None, + ) -> None: + selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 2: + self._mapped_region_chip_worker(selected).mapped_region_notify(region.handle, int(signal_id), int(value)) + return + dw = self._ensure_mapped_region_l3_control_ready(selected) + try: + dw.control_mapped_region_notify(selected, int(region.handle), int(signal_id), int(value)) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + + def mapped_region_wait( + self, + region: MappedRegion, + signal_id: int, + target: int, + timeout_us: int, + worker_id: Optional[int] = None, + ) -> None: + 
selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 2: + self._mapped_region_chip_worker(selected).mapped_region_wait( + region.handle, int(signal_id), int(target), int(timeout_us) + ) + return + dw = self._ensure_mapped_region_l3_control_ready(selected) + try: + dw.control_mapped_region_wait(selected, int(region.handle), int(signal_id), int(target), int(timeout_us)) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) n_sub = self._config.get("num_sub_workers", 0) diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index f0f01d438..c60ecf0b7 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -51,8 +51,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/device_runner.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/host_device_mapped_region_onboard.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/comm_hccl.cpp" diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 4f04a94ce..967e0f139 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -495,6 +495,107 @@ void DeviceRunner::free_tensor(void *dev_ptr) { } } +int DeviceRunner::host_register_device_memory(void *dev_ptr, size_t bytes, void **host_ptr) { + if (dev_ptr == nullptr || host_ptr == nullptr || bytes == 0) { + 
return -1;
+    }
+    // Clear the output first so every failure path below leaves it null.
+    *host_ptr = nullptr;
+    if (device_id_ < 0) {
+        LOG_ERROR("host_register_device_memory requires an attached device");
+        return -1;
+    }
+    if (load_hal_if_needed() != 0) {
+        LOG_ERROR("Failed to load ascend_hal for mapped region: %s", dlerror());
+        return -1;
+    }
+    HalHostRegisterFn fn = get_halHostRegister();
+    if (fn == nullptr) {
+        LOG_ERROR("halHostRegister symbol not found: %s", dlerror());
+        return -1;
+    }
+    // On failure the HAL return code is logged and passed back unchanged.
+    int rc = fn(dev_ptr, bytes, DEV_SVM_MAP_HOST, device_id_, host_ptr);
+    if (rc != 0) {
+        LOG_ERROR(
+            "halHostRegister mapped region failed: dev_ptr=%p size=%zu device=%d rc=%d", dev_ptr, bytes, device_id_, rc
+        );
+    }
+    return rc;
+}
+
+// Undo host_register_device_memory(). A null host_ptr is treated as
+// "nothing to unmap" and returns 0; a detached device or a missing HAL symbol
+// returns -1; otherwise the halHostUnregister return code is passed back.
+int DeviceRunner::host_unregister_device_memory(void *host_ptr) {
+    if (host_ptr == nullptr) {
+        return 0;
+    }
+    if (device_id_ < 0) {
+        LOG_ERROR("host_unregister_device_memory requires an attached device");
+        return -1;
+    }
+    HalHostUnregisterFn fn = get_halHostUnregister();
+    if (fn == nullptr) {
+        LOG_ERROR("halHostUnregister symbol not found: %s", dlerror());
+        return -1;
+    }
+    int rc = fn(host_ptr, device_id_);
+    if (rc != 0) {
+        LOG_ERROR("halHostUnregister mapped region failed: host_ptr=%p device=%d rc=%d", host_ptr, device_id_, rc);
+    }
+    return rc;
+}
+
+namespace {
+
+// Clean (write back) the host data-cache lines covering
+// [host_ptr, host_ptr + bytes). On aarch64 the range is widened to 64-byte
+// line boundaries and each line gets `dc cvac`, followed by `dsb sy` and `isb`.
+// On every other target only a compiler barrier is emitted. A null pointer or
+// zero length is a no-op.
+// NOTE(review): the reinterpret_cast target type is missing in this copy of
+// the patch (presumably uintptr_t) — confirm against the original change.
+void clean_host_cache_range(void *host_ptr, size_t bytes) {
+    if (host_ptr == nullptr || bytes == 0) {
+        return;
+    }
+#if defined(__aarch64__)
+    constexpr uintptr_t kCacheLineBytes = 64;
+    uintptr_t start = reinterpret_cast(host_ptr) & ~(kCacheLineBytes - 1U);
+    uintptr_t end = (reinterpret_cast(host_ptr) + bytes + kCacheLineBytes - 1U) & ~(kCacheLineBytes - 1U);
+    for (uintptr_t p = start; p < end; p += kCacheLineBytes) {
+        __asm__ __volatile__("dc cvac, %0" ::"r"(p) : "memory");
+    }
+    __asm__ __volatile__("dsb sy" ::: "memory");
+    __asm__ __volatile__("isb" ::: "memory");
+#elif defined(__x86_64__)
+    __asm__ __volatile__("" ::: "memory");
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+// Same walk as clean_host_cache_range(), but issues `dc civac` so each line is
+// cleaned and invalidated rather than only cleaned.
+void clean_invalidate_host_cache_range(void *host_ptr, size_t bytes) {
+    if (host_ptr == nullptr || bytes == 0) {
+        return;
+    }
+#if defined(__aarch64__)
+    constexpr uintptr_t kCacheLineBytes = 64;
+    uintptr_t start = reinterpret_cast(host_ptr) & ~(kCacheLineBytes - 1U);
+    uintptr_t end = (reinterpret_cast(host_ptr) + bytes + kCacheLineBytes - 1U) & ~(kCacheLineBytes - 1U);
+    for (uintptr_t p = start; p < end; p += kCacheLineBytes) {
+        __asm__ __volatile__("dc civac, %0" ::"r"(p) : "memory");
+    }
+    __asm__ __volatile__("dsb sy" ::: "memory");
+    __asm__ __volatile__("isb" ::: "memory");
+#elif defined(__x86_64__)
+    __asm__ __volatile__("" ::: "memory");
+#else
+    __asm__ __volatile__("" ::: "memory");
+#endif
+}
+
+}  // namespace
+
+// Member wrapper around clean_host_cache_range(); always returns 0.
+int DeviceRunner::flush_host_cache_range(void *host_ptr, size_t bytes) {
+    clean_host_cache_range(host_ptr, bytes);
+    return 0;
+}
+
+// Member wrapper around clean_invalidate_host_cache_range(); always returns 0.
+int DeviceRunner::invalidate_host_cache_range(void *host_ptr, size_t bytes) {
+    clean_invalidate_host_cache_range(host_ptr, bytes);
+    return 0;
+}
+
 int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) {
     return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
 }
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index bd9d088b0..8870c8d58 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -233,6 +233,28 @@ class DeviceRunner {
      */
     void free_tensor(void *dev_ptr);
+    /**
+     * Map device memory into a host-visible aperture via Ascend HAL.
+     *
+     * @param dev_ptr  Device allocation returned by allocate_tensor().
+     * @param bytes    Mapping size in bytes.
+     * @param host_ptr Output host mapping pointer.
+     * @return 0 on success, non-zero HAL/runtime error on failure.
+     */
+    int host_register_device_memory(void *dev_ptr, size_t bytes, void **host_ptr);
+
+    /**
+     * Unmap a host-visible aperture created by host_register_device_memory().
+     *
+     * @param host_ptr Host mapping pointer returned by HAL.
+     * @return 0 on success, non-zero HAL/runtime error on failure.
+     */
+    int host_unregister_device_memory(void *host_ptr);
+
+    /**
+     * Clean (write back) the host cache lines covering a mapped range.
+     *
+     * @param host_ptr Start of the host range; a null pointer is a no-op.
+     * @param bytes    Length of the range in bytes; zero is a no-op.
+     * @return Always 0 in the current implementation.
+     */
+    int flush_host_cache_range(void *host_ptr, size_t bytes);
+
+    /**
+     * Clean and invalidate the host cache lines covering a mapped range.
+     *
+     * @param host_ptr Start of the host range; a null pointer is a no-op.
+     * @param bytes    Length of the range in bytes; zero is a no-op.
+     * @return Always 0 in the current implementation.
+     */
+    int invalidate_host_cache_range(void *host_ptr, size_t bytes);
+
     /**
      * Copy data from host to device
      *
diff --git a/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp
new file mode 100644
index 000000000..1c10e6970
--- /dev/null
+++ b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include "host_device_mapped_region_onboard.h"
+
+// NOTE(review): the two bare `#include` lines below have lost their header
+// names in this copy of the patch. The file uses EINVAL/EIO/ENOMEM and
+// std::nothrow, so presumably <cerrno> and <new> — confirm against the PR.
+#include
+
+#include
+
+#include "device_runner.h"
+
+namespace {
+
+// Bookkeeping for one onboard mapped region: the runner that made the
+// allocation, the device allocation itself, and its host-side mapping.
+struct OnboardMappedRegionResource {
+    DeviceRunner *runner = nullptr;
+    void *dev_ptr = nullptr;
+    void *host_ptr = nullptr;
+};
+
+}  // namespace
+
+// Allocate `total_bytes` of device memory through the context's DeviceRunner,
+// map it into host address space, and fill `platform` with the resource plus
+// flush / invalidate / release callbacks.
+//
+// Returns 0 on success with *host_base / *device_base set. Returns -EINVAL for
+// a null argument, -EIO when the runner has no attached device or the host
+// mapping fails, and -ENOMEM when either allocation fails. Both outputs are
+// nulled before any work so they stay null on every failure path.
+// NOTE(review): the static_cast target types are missing throughout this copy
+// of the patch — confirm against the original change.
+int a2a3_onboard_host_device_mapped_region_allocate(
+    DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base,
+    void **device_base
+) {
+    if (ctx == NULL || platform == NULL || host_base == NULL || device_base == NULL) {
+        return -EINVAL;
+    }
+    *host_base = nullptr;
+    *device_base = nullptr;
+    auto *runner = static_cast(ctx);
+    if (runner->device_id() < 0) {
+        return -EIO;
+    }
+
+    auto *resource = new (std::nothrow) OnboardMappedRegionResource;
+    if (resource == nullptr) {
+        return -ENOMEM;
+    }
+    resource->runner = runner;
+    resource->dev_ptr = runner->allocate_tensor(static_cast(total_bytes));
+    if (resource->dev_ptr == nullptr) {
+        delete resource;
+        return -ENOMEM;
+    }
+
+    int rc =
+        runner->host_register_device_memory(resource->dev_ptr, static_cast(total_bytes), &resource->host_ptr);
+    if (rc != 0 || resource->host_ptr == nullptr) {
+        // Mapping failed: give the device allocation back before bailing out.
+        runner->free_tensor(resource->dev_ptr);
+        delete resource;
+        return -EIO;
+    }
+
+    platform->resource = resource;
+    platform->device_id = static_cast(runner->device_id());
+    // The callbacks are capture-less lambdas, so the runner travels through
+    // the platform struct's cookie field instead of a capture.
+    platform->cache_ops_cookie = runner;
+    platform->flush_host_range = [](HostDeviceMappedRegionPlatform *p, void *host_ptr, uint64_t bytes) {
+        auto *mapped_runner = static_cast(p->cache_ops_cookie);
+        if (mapped_runner == nullptr) {
+            return -1;
+        }
+        return mapped_runner->flush_host_cache_range(host_ptr, static_cast(bytes));
+    };
+    platform->invalidate_host_range = [](HostDeviceMappedRegionPlatform *p, void *host_ptr, uint64_t bytes) {
+        auto *mapped_runner = static_cast(p->cache_ops_cookie);
+        if (mapped_runner == nullptr) {
+            return -1;
+        }
+        return mapped_runner->invalidate_host_cache_range(host_ptr, static_cast(bytes));
+    };
+    // Teardown order: unmap the host view first, then free the device
+    // allocation. The unregister result is deliberately discarded, so release
+    // always proceeds to the free.
+    platform->release = [](HostDeviceMappedRegionPlatform *p) {
+        auto *r = static_cast(p->resource);
+        if (r == nullptr) {
+            return;
+        }
+        if (r->runner != nullptr) {
+            if (r->host_ptr != nullptr) {
+                (void)r->runner->host_unregister_device_memory(r->host_ptr);
+            }
+            if (r->dev_ptr != nullptr) {
+                r->runner->free_tensor(r->dev_ptr);
+            }
+        }
+        delete r;
+        // Clearing the slot makes a second release() call return early.
+        p->resource = nullptr;
+    };
+    *host_base = resource->host_ptr;
+    *device_base = resource->dev_ptr;
+    return 0;
+}
diff --git a/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h
new file mode 100644
index 000000000..6bfd77d6b
--- /dev/null
+++ b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_
+#define SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_
+
+#include "host_device_comm/host_device_mapped_region.h"
+
+// Onboard backing-store allocator for mapped regions. It is passed to
+// host_device_mapped_region_open_common() by the a2a3 onboard C API.
+// Returns 0 on success or a negative errno-style code.
+int a2a3_onboard_host_device_mapped_region_allocate(
+    DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base,
+    void **device_base
+);
+
+#endif  // SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index f36aa6f0d..457449506 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -18,9 +18,12 @@
 #include "pto_runtime_c_api.h"
 #include "callable.h"
+#include "host_device_comm/host_device_mapped_region.h"
+#include "host_device_mapped_region_onboard.h"
 #include "prepare_callable_common.h"
 #include "task_args.h"
+#include
 #include
 #include
@@ -144,7 +147,10 @@ DeviceContextHandle create_device_context(void) {
     }
 }
-void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); }
+void destroy_device_context(DeviceContextHandle ctx) {
+    // Close any mapped regions still open on this context before deleting it.
+    host_device_mapped_region_close_all_common(ctx);
+    delete static_cast(ctx);
+}
 size_t get_runtime_size(void) { return sizeof(Runtime); }
@@ -218,12 +224,59 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de
 int finalize_device(DeviceContextHandle ctx) {
     if (ctx == NULL) return -1;
     try {
+        // Regions are closed while the runner is still usable, i.e. before finalize().
+        host_device_mapped_region_close_all_common(ctx);
         return static_cast(ctx)->finalize();
     } catch (...) {
         return -1;
     }
 }
+// Mapped-region entry points of the host runtime C API. Each one forwards to
+// the shared host_device_mapped_region_*_common implementation; open supplies
+// the onboard allocator.
+int open_host_device_mapped_region_ctx(
+    DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region
+) {
+    return host_device_mapped_region_open_common(ctx, cfg, out_region, a2a3_onboard_host_device_mapped_region_allocate);
+}
+
+int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) {
+    return host_device_mapped_region_close_common(ctx, region);
+}
+
+int host_device_mapped_region_info_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info
+) {
+    int rc = host_device_mapped_region_info_common(ctx, region, info);
+    if (rc == 0) {
+        // Host-side mapping addresses are not handed to callers; report 0.
+        info->host_data_ptr = 0;
+        info->host_signal_ptr = 0;
+    }
+    return rc;
+}
+
+int host_device_mapped_region_datacopy_h2region_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes
+) {
+    return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes);
+}
+
+int host_device_mapped_region_datacopy_region2h_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes
+) {
+    return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes);
+}
+
+int host_device_mapped_region_notify_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value
+) {
+    return host_device_mapped_region_notify_common(ctx, region, signal_id, value);
+}
+
+int host_device_mapped_region_wait_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target,
+    uint32_t timeout_us
+) {
+    return host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us);
+}
+
 int simpler_init(
     DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size,
     const uint8_t *aicore_binary, size_t aicore_size
diff --git
a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt
index 49d0fe62a..c0e407eef 100644
--- a/src/a2a3/platform/sim/host/CMakeLists.txt
+++ b/src/a2a3/platform/sim/host/CMakeLists.txt
@@ -43,6 +43,8 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region_sim.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 7c1e3cb7e..b5cc84bdd 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -18,9 +18,12 @@
 #include "pto_runtime_c_api.h"
 #include "callable.h"
+#include "host_device_comm/host_device_mapped_region.h"
+#include "host_device_comm/host_device_mapped_region_sim.h"
 #include "prepare_callable_common.h"
 #include "task_args.h"
+#include
 #include
 #include
@@ -139,7 +142,10 @@ DeviceContextHandle create_device_context(void) {
     }
 }
-void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); }
+void destroy_device_context(DeviceContextHandle ctx) {
+    // Close any mapped regions still open on this context before deleting it.
+    host_device_mapped_region_close_all_common(ctx);
+    delete static_cast(ctx);
+}
 size_t get_runtime_size(void) { return sizeof(Runtime); }
@@ -180,6 +186,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de
 int finalize_device(DeviceContextHandle ctx) {
     if (ctx == NULL) return -1;
     try {
+        // Regions are closed before the runner is finalized.
+        host_device_mapped_region_close_all_common(ctx);
         int rc = static_cast(ctx)->finalize();
         int dev = pto_cpu_sim_get_bound_device();
         if (dev >= 0) {
@@ -191,6 +198,52 @@ int finalize_device(DeviceContextHandle ctx) {
     }
 }
+// Mapped-region entry points for the simulator build. Same shape as the
+// onboard wrappers, except that open supplies the simulator allocator.
+int open_host_device_mapped_region_ctx(
+    DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region
+) {
+    return host_device_mapped_region_open_common(ctx, cfg, out_region, host_device_mapped_region_allocate_sim);
+}
+
+int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) {
+    return host_device_mapped_region_close_common(ctx, region);
+}
+
+int host_device_mapped_region_info_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info
+) {
+    int rc = host_device_mapped_region_info_common(ctx, region, info);
+    if (rc == 0) {
+        // Host-side mapping addresses are not handed to callers; report 0.
+        info->host_data_ptr = 0;
+        info->host_signal_ptr = 0;
+    }
+    return rc;
+}
+
+int host_device_mapped_region_datacopy_h2region_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes
+) {
+    return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes);
+}
+
+int host_device_mapped_region_datacopy_region2h_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes
+) {
+    return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes);
+}
+
+int host_device_mapped_region_notify_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value
+) {
+    return host_device_mapped_region_notify_common(ctx, region, signal_id, value);
+}
+
+int host_device_mapped_region_wait_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target,
+    uint32_t timeout_us
+) {
+    return host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us);
+}
+
 /* ===========================================================================
  * ACL lifecycle
stubs. Sim has no ACL / aclrtStream concept, so these
  * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the
diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt
index e5b57bf7a..44a0b2184 100644
--- a/src/a5/platform/onboard/host/CMakeLists.txt
+++ b/src/a5/platform/onboard/host/CMakeLists.txt
@@ -40,6 +40,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 21f919fd0..b180bdc89 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -18,9 +18,11 @@
 #include "pto_runtime_c_api.h"
 #include "callable.h"
+#include "host_device_comm/host_device_mapped_region.h"
 #include "prepare_callable_common.h"
 #include "task_args.h"
+#include
 #include
 #include
@@ -185,12 +187,65 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de
 int finalize_device(DeviceContextHandle ctx) {
     if (ctx == NULL) return -1;
     try {
+        // Regions are closed before the runner is finalized.
+        host_device_mapped_region_close_all_common(ctx);
         return static_cast(ctx)->finalize();
     } catch (...) {
         return -1;
     }
 }
+// Mapped regions cannot be opened on a5 onboard: open ignores its inputs,
+// clears *out_region when one is supplied, and returns -ENOTSUP.
+int open_host_device_mapped_region_ctx(
+    DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region
+) {
+    (void)ctx;
+    (void)cfg;
+    if (out_region != NULL) {
+        *out_region = NULL;
+    }
+    return -ENOTSUP;
+}
+
+// The remaining entry points still call the common implementation, but report
+// its -EINVAL as -ENOTSUP; other return codes pass through unchanged.
+// NOTE(review): presumably because no handle can be valid when open never
+// succeeds, so "invalid handle" really means "unsupported" here — confirm.
+int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) {
+    int rc = host_device_mapped_region_close_common(ctx, region);
+    return rc == -EINVAL ? -ENOTSUP : rc;
+}
+
+int host_device_mapped_region_info_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info
+) {
+    int rc = host_device_mapped_region_info_common(ctx, region, info);
+    return rc == -EINVAL ? -ENOTSUP : rc;
+}
+
+int host_device_mapped_region_datacopy_h2region_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes
+) {
+    int rc = host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes);
+    return rc == -EINVAL ? -ENOTSUP : rc;
+}
+
+int host_device_mapped_region_datacopy_region2h_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes
+) {
+    int rc = host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes);
+    return rc == -EINVAL ? -ENOTSUP : rc;
+}
+
+int host_device_mapped_region_notify_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value
+) {
+    int rc = host_device_mapped_region_notify_common(ctx, region, signal_id, value);
+    return rc == -EINVAL ? -ENOTSUP : rc;
+}
+
+int host_device_mapped_region_wait_ctx(
+    DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target,
+    uint32_t timeout_us
+) {
+    int rc = host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us);
+    return rc == -EINVAL ?
-ENOTSUP : rc; +} + /* =========================================================================== * ACL + comm_* placeholders (distributed runtime not yet implemented on a5) * diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt index c42ed3fa7..31e7c311c 100644 --- a/src/a5/platform/sim/host/CMakeLists.txt +++ b/src/a5/platform/sim/host/CMakeLists.txt @@ -44,6 +44,8 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/profiling_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region_sim.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index debf09f75..f1746811a 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -18,9 +18,12 @@ #include "pto_runtime_c_api.h" #include "callable.h" +#include "host_device_comm/host_device_mapped_region.h" +#include "host_device_comm/host_device_mapped_region_sim.h" #include "prepare_callable_common.h" #include "task_args.h" +#include #include #include @@ -139,7 +142,10 @@ DeviceContextHandle create_device_context(void) { } } -void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } +void destroy_device_context(DeviceContextHandle ctx) { + host_device_mapped_region_close_all_common(ctx); + delete static_cast(ctx); +} size_t get_runtime_size(void) { return sizeof(Runtime); } @@ -180,6 +186,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de int 
finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { + host_device_mapped_region_close_all_common(ctx); int rc = static_cast(ctx)->finalize(); int dev = pto_cpu_sim_get_bound_device(); if (dev >= 0) { @@ -191,6 +198,52 @@ int finalize_device(DeviceContextHandle ctx) { } } +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +) { + return host_device_mapped_region_open_common(ctx, cfg, out_region, host_device_mapped_region_allocate_sim); +} + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) { + return host_device_mapped_region_close_common(ctx, region); +} + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +) { + int rc = host_device_mapped_region_info_common(ctx, region, info); + if (rc == 0) { + info->host_data_ptr = 0; + info->host_signal_ptr = 0; + } + return rc; +} + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +) { + return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes); +} + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +) { + return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes); +} + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +) { + return host_device_mapped_region_notify_common(ctx, region, signal_id, value); +} + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + return 
host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us); +} + /* =========================================================================== * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The diff --git a/src/common/hierarchical/worker.h b/src/common/hierarchical/worker.h index 3ff7ec1be..0b8c925c8 100644 --- a/src/common/hierarchical/worker.h +++ b/src/common/hierarchical/worker.h @@ -105,6 +105,23 @@ class Worker { void control_comm_init(int worker_id, const std::string &request_shm_name) { manager_.control_comm_init(worker_id, request_shm_name.c_str()); } + uint64_t control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + return manager_.control_open_mapped_region(worker_id, data_bytes, signal_count, flags); + } + void control_close_mapped_region(int worker_id, uint64_t handle) { + manager_.control_close_mapped_region(worker_id, handle); + } + void control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const std::string &shm_name) { + manager_.control_mapped_region_payload(worker_id, sub_cmd, shm_name.c_str()); + } + void control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value) { + manager_.control_mapped_region_notify(worker_id, handle, signal_id, value); + } + void control_mapped_region_wait( + int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us + ) { + manager_.control_mapped_region_wait(worker_id, handle, signal_id, target, timeout_us); + } // Broadcast CTRL_REGISTER / CTRL_UNREGISTER for a ChipCallable cid to // every NEXT_LEVEL child in parallel. 
`blob_ptr`/`blob_size` describe
diff --git a/src/common/hierarchical/worker_manager.cpp b/src/common/hierarchical/worker_manager.cpp
index c26f3e2fe..390159af2 100644
--- a/src/common/hierarchical/worker_manager.cpp
+++ b/src/common/hierarchical/worker_manager.cpp
@@ -323,11 +323,13 @@ WorkerThread *WorkerManager::pick_idle_excluding(WorkerType type, const std::vec
 // WorkerThread — memory control (orch thread, concurrent with worker thread)
 // =============================================================================
-static void write_control_args(char *mbox, uint64_t sub_cmd, uint64_t a0 = 0, uint64_t a1 = 0, uint64_t a2 = 0) {
+// Writes the sub-command and up to four 64-bit arguments into the mailbox.
+// Arguments left at their default are written as 0.
+static void
+write_control_args(char *mbox, uint64_t sub_cmd, uint64_t a0 = 0, uint64_t a1 = 0, uint64_t a2 = 0, uint64_t a3 = 0) {
     std::memcpy(mbox + MAILBOX_OFF_CALLABLE, &sub_cmd, sizeof(uint64_t));
     std::memcpy(mbox + CTRL_OFF_ARG0, &a0, sizeof(uint64_t));
     std::memcpy(mbox + CTRL_OFF_ARG1, &a1, sizeof(uint64_t));
     std::memcpy(mbox + CTRL_OFF_ARG2, &a2, sizeof(uint64_t));
+    std::memcpy(mbox + CTRL_OFF_ARG3, &a3, sizeof(uint64_t));
 }
 static uint64_t read_control_result(const char *mbox) {
@@ -336,6 +338,8 @@ static uint64_t read_control_result(const char *mbox) {
     return r;
 }
+// Forward declaration: control_mapped_region_payload() below calls this
+// before the point where it is (presumably) defined later in the file.
+static void write_shm_name_pair(char *mbox, const char *request_shm_name, const char *reply_shm_name);
+
 // Issue a control sub-command and block until the child publishes
 // CONTROL_DONE. Caller must hold `mailbox_mu_`. On a non-zero error code
 // from the child, throws and leaves the mailbox in IDLE before unwinding
@@ -442,6 +446,54 @@ void WorkerThread::control_copy_from(uint64_t dst, uint64_t src, size_t size) {
     run_control_command("control_copy_from");
 }
+// Mapped-region control commands. Each one takes `mailbox_mu_`, stages its
+// arguments in the mailbox and runs the command via run_control_command().
+// NOTE(review): the std::lock_guard template argument and the static_cast
+// target types are missing in this copy of the patch — confirm against the PR.
+
+// Returns the value the child left in the mailbox result slot.
+uint64_t WorkerThread::control_open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags) {
+    std::lock_guard lk(mailbox_mu_);
+    write_control_args(
+        mbox(), CTRL_OPEN_MAPPED_REGION, data_bytes, static_cast(signal_count), static_cast(flags)
+    );
+    run_control_command("control_open_mapped_region");
+    return read_control_result(mbox());
+}
+
+void WorkerThread::control_close_mapped_region(uint64_t handle) {
+    std::lock_guard lk(mailbox_mu_);
+    write_control_args(mbox(), CTRL_CLOSE_MAPPED_REGION, handle);
+    run_control_command("control_close_mapped_region");
+}
+
+// Shared path for the three commands whose request and reply travel in a
+// shared-memory segment (info, h2region, region2h). Only the sub-command and
+// the request shm name are staged; the reply name slot is written empty.
+// Throws std::runtime_error for any other sub-command or an empty name.
+void WorkerThread::control_mapped_region_payload(uint64_t sub_cmd, const char *shm_name) {
+    if (sub_cmd != CTRL_MAPPED_REGION_INFO && sub_cmd != CTRL_MAPPED_REGION_DATACOPY_H2REGION &&
+        sub_cmd != CTRL_MAPPED_REGION_DATACOPY_REGION2H) {
+        throw std::runtime_error("control_mapped_region_payload: invalid sub-command");
+    }
+    if (!shm_name || !*shm_name) {
+        throw std::runtime_error("control_mapped_region_payload: shm name must be non-empty");
+    }
+    std::lock_guard lk(mailbox_mu_);
+    std::memcpy(mbox() + MAILBOX_OFF_CALLABLE, &sub_cmd, sizeof(uint64_t));
+    write_shm_name_pair(mbox(), shm_name, "");
+    run_control_command("control_mapped_region_payload");
+}
+
+void WorkerThread::control_mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value) {
+    std::lock_guard lk(mailbox_mu_);
+    write_control_args(
+        mbox(), CTRL_MAPPED_REGION_NOTIFY, handle, static_cast(signal_id), static_cast(value)
+    );
+    run_control_command("control_mapped_region_notify");
+}
+
+// The only command that uses the fourth argument slot (timeout_us).
+void WorkerThread::control_mapped_region_wait(
+    uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us
+) {
+    std::lock_guard lk(mailbox_mu_);
+    write_control_args(
+        mbox(), CTRL_MAPPED_REGION_WAIT, handle, static_cast(signal_id), static_cast(target),
+        static_cast(timeout_us)
+    );
+    run_control_command("control_mapped_region_wait");
+}
+
 // Stage two NUL-terminated shm names at MAILBOX_OFF_ARGS: request first
 // (CTRL_SHM_NAME_BYTES wide) then reply (CTRL_SHM_NAME_BYTES wide). Pads each
 // slot with zeros so stale bytes from a prior op cannot leak into the child's
@@ -624,6 +676,49 @@ void WorkerManager::control_comm_init(int worker_id, const char *request_shm_nam
     wt->control_comm_init(request_shm_name);
 }
+// WorkerManager side of the mapped-region control calls: look up the
+// NEXT_LEVEL worker thread for `worker_id`, throw std::runtime_error if there
+// is none, otherwise forward to the WorkerThread method of the same name.
+uint64_t
+WorkerManager::control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) {
+    auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id);
+    if (wt == nullptr) {
+        throw std::runtime_error("control_open_mapped_region: invalid worker_id " + std::to_string(worker_id));
+    }
+    return wt->control_open_mapped_region(data_bytes, signal_count, flags);
+}
+
+void WorkerManager::control_close_mapped_region(int worker_id, uint64_t handle) {
+    auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id);
+    if (wt == nullptr) {
+        throw std::runtime_error("control_close_mapped_region: invalid worker_id " + std::to_string(worker_id));
+    }
+    wt->control_close_mapped_region(handle);
+}
+
+void WorkerManager::control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const char *shm_name) {
+    auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id);
+    if (wt == nullptr) {
+        throw std::runtime_error("control_mapped_region_payload: invalid worker_id " + std::to_string(worker_id));
+    }
+    wt->control_mapped_region_payload(sub_cmd, shm_name);
+}
+
+void WorkerManager::control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value) {
+    auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id);
+    if (wt == nullptr) {
+        throw std::runtime_error("control_mapped_region_notify: invalid worker_id " + std::to_string(worker_id));
+    }
+
wt->control_mapped_region_notify(handle, signal_id, value);
+}
+
+// Same lookup-then-forward shape as the other control_* wrappers.
+void WorkerManager::control_mapped_region_wait(
+    int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us
+) {
+    auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id);
+    if (wt == nullptr) {
+        throw std::runtime_error("control_mapped_region_wait: invalid worker_id " + std::to_string(worker_id));
+    }
+    wt->control_mapped_region_wait(handle, signal_id, target, timeout_us);
+}
+
 void WorkerManager::broadcast_register_all(int32_t cid, const void *blob_ptr, size_t blob_size) {
     if (next_level_threads_.empty()) return;
diff --git a/src/common/hierarchical/worker_manager.h b/src/common/hierarchical/worker_manager.h
index 76a4bf2c7..1af45cffb 100644
--- a/src/common/hierarchical/worker_manager.h
+++ b/src/common/hierarchical/worker_manager.h
@@ -123,16 +123,25 @@ static constexpr uint64_t CTRL_RELEASE_DOMAIN = 8;
 static constexpr uint64_t CTRL_COMM_INIT = 9;
 static constexpr uint64_t CTRL_PY_REGISTER = 10;
 static constexpr uint64_t CTRL_PY_UNREGISTER = 11;
+// Mapped-region sub-commands. INFO and the two DATACOPY commands are the ones
+// accepted by control_mapped_region_payload(); the others pass their
+// arguments through the fixed mailbox argument slots.
+static constexpr uint64_t CTRL_OPEN_MAPPED_REGION = 12;
+static constexpr uint64_t CTRL_CLOSE_MAPPED_REGION = 13;
+static constexpr uint64_t CTRL_MAPPED_REGION_INFO = 14;
+static constexpr uint64_t CTRL_MAPPED_REGION_DATACOPY_H2REGION = 15;
+static constexpr uint64_t CTRL_MAPPED_REGION_DATACOPY_REGION2H = 16;
+static constexpr uint64_t CTRL_MAPPED_REGION_NOTIFY = 17;
+static constexpr uint64_t CTRL_MAPPED_REGION_WAIT = 18;
 // Control args reuse the task mailbox region (mutually exclusive with task dispatch):
 //   offset 16: uint64 arg0 (size for malloc; ptr for free; dst for copy; cid for register)
 //   offset 24: uint64 arg1 (src for copy)
 //   offset 32: uint64 arg2 (nbytes for copy)
-//   offset 40: uint64 result (returned ptr from malloc)
+//   offset 40: uint64 arg3 (timeout_us for mapped-region wait)
+//   offset 48: uint64 result (returned ptr from malloc / open)
 static constexpr ptrdiff_t CTRL_OFF_ARG0 = 16;
 static constexpr ptrdiff_t CTRL_OFF_ARG1 = 24;
 static constexpr ptrdiff_t CTRL_OFF_ARG2 = 32;
-static constexpr ptrdiff_t CTRL_OFF_RESULT = 40;
+// NOTE(review): the result slot moves from offset 40 to 48 to make room for
+// arg3. Any reader or writer of the result slot that hard-codes the old
+// offset (e.g. on the child side) must change with it — verify.
+static constexpr ptrdiff_t CTRL_OFF_ARG3 = 40;
+static constexpr ptrdiff_t CTRL_OFF_RESULT = 48;
 // CTRL_REGISTER puts the NUL-terminated POSIX shm name at MAILBOX_OFF_ARGS.
 // Fixed-width so the wire layout stays simple; well above the encoded length
@@ -209,6 +218,11 @@ class WorkerThread {
     void control_free(uint64_t ptr);
     void control_copy_to(uint64_t dst, uint64_t src, size_t size);
     void control_copy_from(uint64_t dst, uint64_t src, size_t size);
+    // Mapped-region control commands; open returns the mailbox result value.
+    uint64_t control_open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags);
+    void control_close_mapped_region(uint64_t handle);
+    void control_mapped_region_payload(uint64_t sub_cmd, const char *shm_name);
+    void control_mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value);
+    void control_mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us);
     // Pre-warm a chip child by triggering prepare_callable for `cid` in the
     // child via CTRL_PREPARE. Issued from the parent at end of init() so the
@@ -309,6 +323,13 @@ class WorkerManager {
     void control_alloc_domain(int worker_id, const char *request_shm_name, const char *reply_shm_name);
     void control_release_domain(int worker_id, const char *request_shm_name);
     void control_comm_init(int worker_id, const char *request_shm_name);
+    // Mapped-region control calls addressed to the NEXT_LEVEL worker
+    // `worker_id`; each throws std::runtime_error for an unknown worker.
+    uint64_t control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags);
+    void control_close_mapped_region(int worker_id, uint64_t handle);
+    void control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const char *shm_name);
+    void control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value);
+    void control_mapped_region_wait(
+        int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us
+    );
     // Broadcast CTRL_REGISTER for `cid` to every NEXT_LEVEL worker in
     // parallel. Stages `blob_size` bytes from `blob_ptr` into a per-call
diff --git a/src/common/host_device_comm/host_device_mapped_region.cpp b/src/common/host_device_comm/host_device_mapped_region.cpp
new file mode 100644
index 000000000..66999497a
--- /dev/null
+++ b/src/common/host_device_comm/host_device_mapped_region.cpp
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host_device_comm/host_device_mapped_region.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static_assert(sizeof(HostDeviceMappedRegionHeader) == 64); +static_assert(alignof(HostDeviceMappedRegionHeader) == 64); +static_assert(sizeof(HostDeviceMappedRegionSignalSlot) == 64); +static_assert(alignof(HostDeviceMappedRegionSignalSlot) == 64); +static_assert(offsetof(HostDeviceMappedRegionSignalSlot, value) == 0); + +static_assert(offsetof(HostDeviceMappedRegionInfo, host_data_ptr) == 0); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_data_ptr) == 8); +static_assert(offsetof(HostDeviceMappedRegionInfo, data_bytes) == 16); +static_assert(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr) == 24); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr) == 32); +static_assert(offsetof(HostDeviceMappedRegionInfo, signal_count) == 40); +static_assert(offsetof(HostDeviceMappedRegionInfo, total_bytes) == 48); +static_assert(offsetof(HostDeviceMappedRegionInfo, flags) == 56); +static_assert(sizeof(HostDeviceMappedRegionInfo) == 64); + +namespace { + +constexpr uint64_t kAlignment = 64; + +struct HostDeviceMappedRegion { + DeviceContextHandle owner_ctx = nullptr; + void *host_base = nullptr; + void *device_base = nullptr; + uint64_t total_bytes = 0; + uint64_t data_offset = 0; + uint64_t data_bytes = 0; + uint64_t signal_offset = 0; + uint32_t signal_count = 0; + uint32_t flags = 0; + std::mutex op_mu; + std::condition_variable op_cv; + uint32_t active_ops = 0; + HostDeviceMappedRegionPlatform platform{}; +}; + +std::mutex &registry_mutex() { + static std::mutex mu; + return mu; +} + +std::unordered_map> &registry_by_ctx() { + static std::unordered_map> registry; + return registry; +} + +std::unordered_map &registry_by_handle() { + static std::unordered_map registry; +
return registry; +} + +bool add_overflow(uint64_t a, uint64_t b, uint64_t *out) { + if (a > std::numeric_limits::max() - b) { + return true; + } + *out = a + b; + return false; +} + +bool mul_overflow(uint64_t a, uint64_t b, uint64_t *out) { + if (a != 0 && b > std::numeric_limits::max() / a) { + return true; + } + *out = a * b; + return false; +} + +bool align64(uint64_t value, uint64_t *out) { + uint64_t padded = 0; + if (add_overflow(value, kAlignment - 1, &padded)) { + return false; + } + *out = padded & ~(kAlignment - 1); + return true; +} + +uint8_t *byte_ptr(void *base, uint64_t offset) { return static_cast(base) + offset; } + +const uint8_t *byte_ptr(const void *base, uint64_t offset) { return static_cast(base) + offset; } + +HostDeviceMappedRegion *lookup_region_locked(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + if (ctx == nullptr || handle == nullptr) { + return nullptr; + } + auto it = registry_by_handle().find(handle); + if (it == registry_by_handle().end()) { + return nullptr; + } + HostDeviceMappedRegion *region = it->second; + if (region == nullptr || region->owner_ctx != ctx) { + return nullptr; + } + return region; +} + +HostDeviceMappedRegion *acquire_region(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + std::lock_guard lock(registry_mutex()); + HostDeviceMappedRegion *region = lookup_region_locked(ctx, handle); + if (region == nullptr) { + return nullptr; + } + { + std::lock_guard op_lock(region->op_mu); + ++region->active_ops; + } + return region; +} + +void release_region_op(HostDeviceMappedRegion *region) { + std::lock_guard op_lock(region->op_mu); + --region->active_ops; + if (region->active_ops == 0) { + region->op_cv.notify_all(); + } +} + +void wait_for_region_ops(HostDeviceMappedRegion *region) { + std::unique_lock op_lock(region->op_mu); + region->op_cv.wait(op_lock, [region] { + return region->active_ops == 0; + }); +} + +void release_region(HostDeviceMappedRegion *region) { + if (region == 
nullptr) { + return; + } + if (region->platform.release != nullptr) { + region->platform.release(&region->platform); + } + delete region; +} + +int validate_range(HostDeviceMappedRegion *region, uint64_t offset, size_t nbytes) { + if (offset > region->data_bytes) { + return -EINVAL; + } + if (static_cast(nbytes) > region->data_bytes - offset) { + return -EINVAL; + } + return 0; +} + +HostDeviceMappedRegionSignalSlot *signal_slot(HostDeviceMappedRegion *region, uint32_t signal_id) { + auto *slots = + reinterpret_cast(byte_ptr(region->host_base, region->signal_offset)); + return &slots[signal_id]; +} + +int flush_host_range(HostDeviceMappedRegion *region, void *host_ptr, uint64_t bytes) { + if (bytes == 0 || region->platform.flush_host_range == nullptr) { + return 0; + } + return region->platform.flush_host_range(&region->platform, host_ptr, bytes); +} + +int invalidate_host_range(HostDeviceMappedRegion *region, void *host_ptr, uint64_t bytes) { + if (bytes == 0 || region->platform.invalidate_host_range == nullptr) { + return 0; + } + return region->platform.invalidate_host_range(&region->platform, host_ptr, bytes); +} + +} // namespace + +int host_device_mapped_region_compute_total_bytes( + const HostDeviceMappedRegionConfig *cfg, uint64_t *signal_offset, uint64_t *data_offset, uint64_t *total_bytes +) { + if (cfg == nullptr || signal_offset == nullptr || data_offset == nullptr || total_bytes == nullptr) { + return -EINVAL; + } + if (cfg->data_bytes == 0 || cfg->signal_count == 0 || cfg->flags != 0) { + return -EINVAL; + } + + uint64_t signal_bytes = 0; + if (mul_overflow(cfg->signal_count, sizeof(HostDeviceMappedRegionSignalSlot), &signal_bytes)) { + return -EINVAL; + } + + *signal_offset = sizeof(HostDeviceMappedRegionHeader); + uint64_t signals_end = 0; + if (add_overflow(*signal_offset, signal_bytes, &signals_end)) { + return -EINVAL; + } + if (!align64(signals_end, data_offset)) { + return -EINVAL; + } + + uint64_t data_end = 0; + if (add_overflow(*data_offset, 
cfg->data_bytes, &data_end)) { + return -EINVAL; + } + if (!align64(data_end, total_bytes)) { + return -EINVAL; + } + return 0; +} + +int host_device_mapped_region_open_common( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region, + HostDeviceMappedRegionAllocateFn allocate +) { + if (out_region != nullptr) { + *out_region = nullptr; + } + if (ctx == nullptr || cfg == nullptr || out_region == nullptr || allocate == nullptr) { + return -EINVAL; + } + + uint64_t signal_offset = 0; + uint64_t data_offset = 0; + uint64_t total_bytes = 0; + int rc = host_device_mapped_region_compute_total_bytes(cfg, &signal_offset, &data_offset, &total_bytes); + if (rc != 0) { + return rc; + } + + auto *region = new (std::nothrow) HostDeviceMappedRegion; + if (region == nullptr) { + return -ENOMEM; + } + region->owner_ctx = ctx; + region->total_bytes = total_bytes; + region->data_offset = data_offset; + region->data_bytes = cfg->data_bytes; + region->signal_offset = signal_offset; + region->signal_count = cfg->signal_count; + region->flags = cfg->flags; + + rc = allocate(ctx, total_bytes, &region->platform, &region->host_base, &region->device_base); + if (rc != 0) { + delete region; + return rc; + } + if (region->host_base == nullptr || region->device_base == nullptr) { + release_region(region); + return -EIO; + } + + std::memset(region->host_base, 0, static_cast(total_bytes)); + auto *header = reinterpret_cast(region->host_base); + header->magic = HDMR_MAGIC; + header->version = HDMR_VERSION; + header->flags = cfg->flags; + header->signal_count = cfg->signal_count; + header->signal_offset = signal_offset; + header->data_offset = data_offset; + header->data_bytes = cfg->data_bytes; + header->total_bytes = total_bytes; + rc = flush_host_range(region, region->host_base, total_bytes); + if (rc != 0) { + release_region(region); + return -EIO; + } + + HostDeviceMappedRegionHandle handle = static_cast(region); + { + std::lock_guard 
lock(registry_mutex()); + registry_by_handle()[handle] = region; + registry_by_ctx()[ctx].push_back(region); + } + *out_region = handle; + return 0; +} + +int host_device_mapped_region_close_common(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + HostDeviceMappedRegion *region = nullptr; + { + std::lock_guard lock(registry_mutex()); + region = lookup_region_locked(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + registry_by_handle().erase(handle); + auto &regions = registry_by_ctx()[ctx]; + for (auto it = regions.begin(); it != regions.end(); ++it) { + if (*it == region) { + regions.erase(it); + break; + } + } + if (regions.empty()) { + registry_by_ctx().erase(ctx); + } + } + wait_for_region_ops(region); + release_region(region); + return 0; +} + +void host_device_mapped_region_close_all_common(DeviceContextHandle ctx) { + if (ctx == nullptr) { + return; + } + std::vector regions; + { + std::lock_guard lock(registry_mutex()); + auto it = registry_by_ctx().find(ctx); + if (it == registry_by_ctx().end()) { + return; + } + regions.swap(it->second); + registry_by_ctx().erase(it); + for (HostDeviceMappedRegion *region : regions) { + registry_by_handle().erase(static_cast(region)); + } + } + for (HostDeviceMappedRegion *region : regions) { + wait_for_region_ops(region); + release_region(region); + } +} + +int host_device_mapped_region_info_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, HostDeviceMappedRegionInfo *info +) { + if (info == nullptr) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + std::memset(info, 0, sizeof(*info)); + info->host_data_ptr = reinterpret_cast(byte_ptr(region->host_base, region->data_offset)); + info->device_data_ptr = reinterpret_cast(byte_ptr(region->device_base, region->data_offset)); + info->data_bytes = region->data_bytes; + info->host_signal_ptr = reinterpret_cast(byte_ptr(region->host_base, 
region->signal_offset)); + info->device_signal_ptr = reinterpret_cast(byte_ptr(region->device_base, region->signal_offset)); + info->signal_count = region->signal_count; + info->total_bytes = region->total_bytes; + info->flags = region->flags; + release_region_op(region); + return 0; +} + +int host_device_mapped_region_datacopy_h2region_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint64_t offset, const void *src, size_t nbytes +) { + if (src == nullptr && nbytes != 0) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + int rc = validate_range(region, offset, nbytes); + if (rc != 0) { + release_region_op(region); + return rc; + } + if (nbytes != 0) { + uint8_t *dst = byte_ptr(region->host_base, region->data_offset + offset); + std::memcpy(dst, src, nbytes); + rc = flush_host_range(region, dst, static_cast(nbytes)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_datacopy_region2h_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint64_t offset, void *dst, size_t nbytes +) { + if (dst == nullptr && nbytes != 0) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + int rc = validate_range(region, offset, nbytes); + if (rc != 0) { + release_region_op(region); + return rc; + } + if (nbytes != 0) { + uint8_t *src = byte_ptr(region->host_base, region->data_offset + offset); + rc = invalidate_host_range(region, src, static_cast(nbytes)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + std::memcpy(dst, src, nbytes); + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_notify_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint32_t signal_id, uint32_t value +) { + HostDeviceMappedRegion 
*region = acquire_region(ctx, handle); + if (region == nullptr || signal_id >= region->signal_count) { + if (region != nullptr) { + release_region_op(region); + } + return -EINVAL; + } + auto *slot = signal_slot(region, signal_id); + auto *atomic_value = reinterpret_cast *>(const_cast(&slot->value)); + uint32_t current = atomic_value->load(std::memory_order_acquire); + if (value < current) { + release_region_op(region); + return -EINVAL; + } + atomic_value->store(value, std::memory_order_release); + int rc = flush_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_wait_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr || signal_id >= region->signal_count) { + if (region != nullptr) { + release_region_op(region); + } + return -EINVAL; + } + HostDeviceMappedRegionSignalSlot *slot = signal_slot(region, signal_id); + + int rc = invalidate_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + auto *atomic_value = reinterpret_cast *>(const_cast(&slot->value)); + if (atomic_value->load(std::memory_order_acquire) >= target) { + release_region_op(region); + return 0; + } + if (timeout_us == 0) { + release_region_op(region); + return -EAGAIN; + } + + const auto deadline = std::chrono::steady_clock::now() + std::chrono::microseconds(timeout_us); + do { + rc = invalidate_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + if (atomic_value->load(std::memory_order_acquire) >= target) { + release_region_op(region); + return 0; + } + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } while (std::chrono::steady_clock::now() < deadline); + + 
release_region_op(region); + return -EAGAIN; +} diff --git a/src/common/host_device_comm/host_device_mapped_region.h b/src/common/host_device_comm/host_device_mapped_region.h new file mode 100644 index 000000000..9ef562893 --- /dev/null +++ b/src/common/host_device_comm/host_device_mapped_region.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ +#define SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ + +#include +#include + +#include "worker/pto_runtime_c_api.h" + +static constexpr uint32_t HDMR_MAGIC = 0x48444D52U; +static constexpr uint32_t HDMR_VERSION = 1; + +struct alignas(64) HostDeviceMappedRegionHeader { + uint32_t magic; + uint32_t version; + uint32_t flags; + uint32_t signal_count; + uint64_t signal_offset; + uint64_t data_offset; + uint64_t data_bytes; + uint64_t total_bytes; + uint64_t reserved[2]; +}; + +struct alignas(64) HostDeviceMappedRegionSignalSlot { + volatile uint32_t value; + uint32_t reserved0; + uint64_t reserved[7]; +}; + +struct HostDeviceMappedRegionPlatform { + void *resource; + uint64_t device_id; + void *cache_ops_cookie; + int (*flush_host_range)(HostDeviceMappedRegionPlatform *platform, void *host_ptr, uint64_t bytes); + int 
(*invalidate_host_range)(HostDeviceMappedRegionPlatform *platform, void *host_ptr, uint64_t bytes); + void (*release)(HostDeviceMappedRegionPlatform *platform); +}; + +using HostDeviceMappedRegionAllocateFn = int (*)( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +); + +int host_device_mapped_region_compute_total_bytes( + const HostDeviceMappedRegionConfig *cfg, uint64_t *signal_offset, uint64_t *data_offset, uint64_t *total_bytes +); + +int host_device_mapped_region_open_common( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region, + HostDeviceMappedRegionAllocateFn allocate +); + +int host_device_mapped_region_close_common(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region); + +void host_device_mapped_region_close_all_common(DeviceContextHandle ctx); + +int host_device_mapped_region_info_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +); + +int host_device_mapped_region_notify_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +); + +int host_device_mapped_region_wait_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +); + +#endif // SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ diff --git a/src/common/host_device_comm/host_device_mapped_region_sim.cpp b/src/common/host_device_comm/host_device_mapped_region_sim.cpp new file mode 100644 index 000000000..6894c5800 --- /dev/null +++ 
b/src/common/host_device_comm/host_device_mapped_region_sim.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host_device_comm/host_device_mapped_region_sim.h" + +#include + +#include + +int host_device_mapped_region_allocate_sim( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + (void)ctx; + void *ptr = nullptr; + if (posix_memalign(&ptr, 64, static_cast(total_bytes)) != 0) { + return -ENOMEM; + } + platform->resource = ptr; + platform->release = [](HostDeviceMappedRegionPlatform *p) { + std::free(p->resource); + p->resource = nullptr; + }; + *host_base = ptr; + *device_base = ptr; + return 0; +} diff --git a/src/common/host_device_comm/host_device_mapped_region_sim.h b/src/common/host_device_comm/host_device_mapped_region_sim.h new file mode 100644 index 000000000..b725e8bf1 --- /dev/null +++ b/src/common/host_device_comm/host_device_mapped_region_sim.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ +#define SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ + +#include "host_device_comm/host_device_mapped_region.h" + +int host_device_mapped_region_allocate_sim( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +); + +#endif // SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 7fab4c295..566cf0357 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -12,6 +12,7 @@ #include "chip_worker.h" #include +#include #include #include @@ -125,6 +126,15 @@ void ChipWorker::init( load_symbol(handle, "comm_release_domain_windows"); comm_barrier_fn_ = load_symbol(handle, "comm_barrier"); comm_destroy_fn_ = load_symbol(handle, "comm_destroy"); + open_mapped_region_fn_ = load_symbol(handle, "open_host_device_mapped_region_ctx"); + close_mapped_region_fn_ = load_symbol(handle, "close_host_device_mapped_region_ctx"); + mapped_region_info_fn_ = load_symbol(handle, "host_device_mapped_region_info_ctx"); + mapped_region_datacopy_h2region_fn_ = + load_symbol(handle, "host_device_mapped_region_datacopy_h2region_ctx"); + mapped_region_datacopy_region2h_fn_ = + load_symbol(handle, "host_device_mapped_region_datacopy_region2h_ctx"); + mapped_region_notify_fn_ = load_symbol(handle, 
"host_device_mapped_region_notify_ctx"); + mapped_region_wait_fn_ = load_symbol(handle, "host_device_mapped_region_wait_ctx"); } catch (...) { dlclose(handle); throw; @@ -188,6 +198,13 @@ void ChipWorker::init( comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); throw; } @@ -227,6 +244,13 @@ void ChipWorker::init( comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); throw std::runtime_error("simpler_init failed with code " + std::to_string(init_rc)); } @@ -276,6 +300,13 @@ void ChipWorker::finalize() { comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); initialized_ = false; device_id_ = -1; @@ -482,6 +513,95 @@ void ChipWorker::copy_from(uint64_t dst, uint64_t src, size_t size) { } } +void ChipWorker::check_mapped_region_rc(int rc, const char *op_name) { + if (rc == 0) { + return; + } + std::string msg = op_name; + msg += " failed with code "; + msg += std::to_string(rc); + if (rc == -EINVAL) { + throw std::invalid_argument(msg); + } + 
throw std::runtime_error(msg); +} + +uint64_t ChipWorker::open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + HostDeviceMappedRegionConfig cfg{data_bytes, signal_count, flags}; + HostDeviceMappedRegionHandle region = nullptr; + int rc = open_mapped_region_fn_(device_ctx_, &cfg, &region); + check_mapped_region_rc(rc, "open_mapped_region"); + if (region == nullptr) { + throw std::runtime_error("open_mapped_region returned null handle"); + } + return reinterpret_cast(region); +} + +void ChipWorker::close_mapped_region(uint64_t handle) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = close_mapped_region_fn_(device_ctx_, reinterpret_cast(handle)); + check_mapped_region_rc(rc, "close_mapped_region"); +} + +HostDeviceMappedRegionInfo ChipWorker::mapped_region_info(uint64_t handle) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + HostDeviceMappedRegionInfo info{}; + int rc = mapped_region_info_fn_(device_ctx_, reinterpret_cast(handle), &info); + check_mapped_region_rc(rc, "mapped_region_info"); + info.host_data_ptr = 0; + info.host_signal_ptr = 0; + return info; +} + +void ChipWorker::mapped_region_datacopy_h2region(uint64_t handle, uint64_t offset, const void *src, size_t nbytes) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = mapped_region_datacopy_h2region_fn_( + device_ctx_, reinterpret_cast(handle), offset, src, nbytes + ); + check_mapped_region_rc(rc, "mapped_region_datacopy_h2region"); +} + +void ChipWorker::mapped_region_datacopy_region2h(uint64_t handle, uint64_t offset, void *dst, size_t nbytes) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = 
mapped_region_datacopy_region2h_fn_( + device_ctx_, reinterpret_cast(handle), offset, dst, nbytes + ); + check_mapped_region_rc(rc, "mapped_region_datacopy_region2h"); +} + +void ChipWorker::mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = + mapped_region_notify_fn_(device_ctx_, reinterpret_cast(handle), signal_id, value); + check_mapped_region_rc(rc, "mapped_region_notify"); +} + +void ChipWorker::mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = mapped_region_wait_fn_( + device_ctx_, reinterpret_cast(handle), signal_id, target, timeout_us + ); + if (rc == -EAGAIN || rc == -EWOULDBLOCK) { + throw std::runtime_error("mapped_region_wait timed out"); + } + check_mapped_region_rc(rc, "mapped_region_wait"); +} + uint64_t ChipWorker::comm_init(int rank, int nranks, const std::string &rootinfo_path) { if (!initialized_) { throw std::runtime_error("ChipWorker not initialized; call init() first"); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 2227245f1..7abc8ab76 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -81,6 +81,14 @@ class ChipWorker { void copy_to(uint64_t dst, uint64_t src, size_t size); void copy_from(uint64_t dst, uint64_t src, size_t size); + uint64_t open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags); + void close_mapped_region(uint64_t handle); + HostDeviceMappedRegionInfo mapped_region_info(uint64_t handle); + void mapped_region_datacopy_h2region(uint64_t handle, uint64_t offset, const void *src, size_t nbytes); + void mapped_region_datacopy_region2h(uint64_t handle, uint64_t offset, void *dst, size_t nbytes); + void mapped_region_notify(uint64_t handle, 
uint32_t signal_id, uint32_t value); + void mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us); + /// Distributed communication primitives (optional — only available when /// the bound runtime exports comm_*). Wraps the backend-neutral C API /// defined in src//platform/include/host/comm.h. @@ -158,6 +166,14 @@ class ChipWorker { using CommReleaseDomainWindowsFn = int (*)(void *, uint64_t, size_t, uint32_t); using CommBarrierFn = int (*)(void *); using CommDestroyFn = int (*)(void *); + using OpenMappedRegionFn = int (*)(void *, const HostDeviceMappedRegionConfig *, HostDeviceMappedRegionHandle *); + using CloseMappedRegionFn = int (*)(void *, HostDeviceMappedRegionHandle); + using MappedRegionInfoFn = int (*)(void *, HostDeviceMappedRegionHandle, HostDeviceMappedRegionInfo *); + using MappedRegionDatacopyH2RegionFn = + int (*)(void *, HostDeviceMappedRegionHandle, uint64_t, const void *, size_t); + using MappedRegionDatacopyRegion2HFn = int (*)(void *, HostDeviceMappedRegionHandle, uint64_t, void *, size_t); + using MappedRegionNotifyFn = int (*)(void *, HostDeviceMappedRegionHandle, uint32_t, uint32_t); + using MappedRegionWaitFn = int (*)(void *, HostDeviceMappedRegionHandle, uint32_t, uint32_t, uint32_t); struct CommSession { void *handle = nullptr; @@ -175,6 +191,7 @@ class ChipWorker { int destroy_comm_session(CommSession &session); uint64_t create_base_comm(int rank, int nranks, const std::string &rootinfo_path); void clear_comm_sessions(); + void check_mapped_region_rc(int rc, const char *op_name); void *lib_handle_ = nullptr; CreateDeviceContextFn create_device_context_fn_ = nullptr; @@ -203,6 +220,13 @@ class ChipWorker { CommReleaseDomainWindowsFn comm_release_domain_windows_fn_ = nullptr; CommBarrierFn comm_barrier_fn_ = nullptr; CommDestroyFn comm_destroy_fn_ = nullptr; + OpenMappedRegionFn open_mapped_region_fn_ = nullptr; + CloseMappedRegionFn close_mapped_region_fn_ = nullptr; + MappedRegionInfoFn 
mapped_region_info_fn_ = nullptr; + MappedRegionDatacopyH2RegionFn mapped_region_datacopy_h2region_fn_ = nullptr; + MappedRegionDatacopyRegion2HFn mapped_region_datacopy_region2h_fn_ = nullptr; + MappedRegionNotifyFn mapped_region_notify_fn_ = nullptr; + MappedRegionWaitFn mapped_region_wait_fn_ = nullptr; void *device_ctx_ = nullptr; std::vector comm_sessions_; std::unordered_map comm_session_index_; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 00debb446..a55f855ce 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -47,6 +47,7 @@ extern "C" { typedef void *RuntimeHandle; typedef void *DeviceContextHandle; +typedef void *HostDeviceMappedRegionHandle; /** * Timing breakdown for a single run_prepared() invocation. @@ -73,6 +74,25 @@ typedef struct PtoRunTiming { uint64_t device_wall_ns; } PtoRunTiming; +typedef struct HostDeviceMappedRegionConfig { + uint64_t data_bytes; + uint32_t signal_count; + uint32_t flags; +} HostDeviceMappedRegionConfig; + +typedef struct HostDeviceMappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint32_t reserved0; + uint64_t total_bytes; + uint32_t flags; + uint32_t reserved1; +} HostDeviceMappedRegionInfo; + /* =========================================================================== * Public API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -141,6 +161,33 @@ int simpler_init( */ int finalize_device(DeviceContextHandle ctx); +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +); + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region); + +int host_device_mapped_region_info_ctx( + 
DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +); + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +); + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +); + /* =========================================================================== * Per-callable_id preparation * diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 89314d800..3445a2c2f 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -237,6 +237,7 @@ add_hierarchical_test(test_ring hierarchical/test_ring.cpp) add_hierarchical_test(test_scope hierarchical/test_scope.cpp) add_hierarchical_test(test_orchestrator hierarchical/test_orchestrator.cpp) add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp) +add_hierarchical_test(test_mailbox_control_layout hierarchical/test_mailbox_control_layout.cpp) # --------------------------------------------------------------------------- # Types / task_interface tests (src/common/task_interface/) @@ -271,6 +272,24 @@ add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) add_common_utils_test(test_device_arena common/test_device_arena.cpp) +add_executable(test_host_device_mapped_region + common/test_host_device_mapped_region.cpp + ${CMAKE_SOURCE_DIR}/../../../src/common/host_device_comm/host_device_mapped_region.cpp +) 
+target_include_directories(test_host_device_mapped_region PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/common + ${CMAKE_SOURCE_DIR}/../../../src/common/worker +) +target_compile_options(test_host_device_mapped_region PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_host_device_mapped_region PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_host_device_mapped_region COMMAND test_host_device_mapped_region) +set_tests_properties(test_host_device_mapped_region PROPERTIES LABELS "no_hardware") + # Per-callable_id orch SO file naming regression (see rtStreamSynchronize # 507018 root cause). Compiles the a2a3 onboard `create_orch_so_file` # against the test source so it runs on no-hw runners too. diff --git a/tests/ut/cpp/common/test_host_device_mapped_region.cpp b/tests/ut/cpp/common/test_host_device_mapped_region.cpp new file mode 100644 index 000000000..bcd608524 --- /dev/null +++ b/tests/ut/cpp/common/test_host_device_mapped_region.cpp @@ -0,0 +1,319 @@ +#include "host_device_comm/host_device_mapped_region.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct CacheOpsRecorder { + int flush_count = 0; + int invalidate_count = 0; + std::vector events; +}; + +int allocate_heap_region( + DeviceContextHandle, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + void *ptr = nullptr; + if (posix_memalign(&ptr, 64, static_cast(total_bytes)) != 0) { + return -ENOMEM; + } + platform->resource = ptr; + platform->release = [](HostDeviceMappedRegionPlatform *p) { + std::free(p->resource); + p->resource = nullptr; + }; + *host_base = ptr; + *device_base = ptr; + return 0; +} + +int allocate_heap_region_with_cache_ops( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, 
total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + platform->cache_ops_cookie = ctx; + platform->flush_host_range = [](HostDeviceMappedRegionPlatform *p, void *, uint64_t) { + auto *recorder = static_cast(p->cache_ops_cookie); + ++recorder->flush_count; + recorder->events.push_back("flush"); + return 0; + }; + platform->invalidate_host_range = [](HostDeviceMappedRegionPlatform *p, void *, uint64_t) { + auto *recorder = static_cast(p->cache_ops_cookie); + ++recorder->invalidate_count; + recorder->events.push_back("invalidate"); + return 0; + }; + return 0; +} + +struct ReleaseRecorder { + int release_count = 0; +}; + +struct ReleaseState { + ReleaseRecorder *recorder = nullptr; + void *resource = nullptr; +}; + +void release_recorded_heap_region(HostDeviceMappedRegionPlatform *p) { + auto *state = static_cast(p->resource); + ++state->recorder->release_count; + std::free(state->resource); + delete state; + p->resource = nullptr; +} + +int allocate_region_without_host_base( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + auto *recorder = static_cast(ctx); + void *resource = platform->resource; + platform->resource = new ReleaseState{recorder, resource}; + platform->release = release_recorded_heap_region; + *host_base = nullptr; + return 0; +} + +int allocate_region_with_failing_initial_flush( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + auto *recorder = static_cast(ctx); + void *resource = platform->resource; + platform->resource = new ReleaseState{recorder, resource}; + platform->release = release_recorded_heap_region; + 
platform->flush_host_range = [](HostDeviceMappedRegionPlatform *, void *, uint64_t) { + return -EIO; + }; + return 0; +} + +} // namespace + +TEST(HostDeviceMappedRegion, PublicAbiLayoutIsStable) { + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, host_data_ptr), 0u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, device_data_ptr), 8u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, data_bytes), 16u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr), 24u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr), 32u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, signal_count), 40u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, total_bytes), 48u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, flags), 56u); + EXPECT_EQ(sizeof(HostDeviceMappedRegionInfo), 64u); +} + +TEST(HostDeviceMappedRegion, InternalLayoutIsStable) { + EXPECT_EQ(sizeof(HostDeviceMappedRegionHeader), 64u); + EXPECT_EQ(alignof(HostDeviceMappedRegionHeader), 64u); + EXPECT_EQ(sizeof(HostDeviceMappedRegionSignalSlot), 64u); + EXPECT_EQ(alignof(HostDeviceMappedRegionSignalSlot), 64u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionSignalSlot, value), 0u); +} + +TEST(HostDeviceMappedRegion, RejectsInvalidConfig) { + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + HostDeviceMappedRegionConfig cfg{0, 1, 0}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, &region, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); + + cfg = HostDeviceMappedRegionConfig{16, 0, 0}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, &region, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); + + cfg = HostDeviceMappedRegionConfig{16, 1, 1}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, &region, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); +} + +TEST(HostDeviceMappedRegion, 
ReleasesBackendResourceWhenOpenValidationFailsAfterAllocate) { + ReleaseRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{16, 1, 0}; + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + + EXPECT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_region_without_host_base), -EIO); + EXPECT_EQ(region, nullptr); + EXPECT_EQ(recorder.release_count, 1); +} + +TEST(HostDeviceMappedRegion, ReleasesBackendResourceWhenInitialFlushFails) { + ReleaseRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{16, 1, 0}; + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + + EXPECT_EQ( + host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_region_with_failing_initial_flush), -EIO + ); + EXPECT_EQ(region, nullptr); + EXPECT_EQ(recorder.release_count, 1); +} + +TEST(HostDeviceMappedRegion, OpensZeroInitializedRegionAndReportsInfo) { + auto ctx = reinterpret_cast(0x20); + HostDeviceMappedRegionConfig cfg{17, 2, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_heap_region), 0); + ASSERT_NE(region, nullptr); + + HostDeviceMappedRegionInfo info{}; + ASSERT_EQ(host_device_mapped_region_info_common(ctx, region, &info), 0); + EXPECT_NE(info.host_data_ptr, 0u); + EXPECT_EQ(info.host_data_ptr, info.device_data_ptr); + EXPECT_NE(info.host_signal_ptr, 0u); + EXPECT_EQ(info.host_signal_ptr, info.device_signal_ptr); + EXPECT_EQ(info.data_bytes, 17u); + EXPECT_EQ(info.signal_count, 2u); + EXPECT_EQ(info.flags, 0u); + EXPECT_EQ(info.total_bytes, 64u + 2u * 64u + 64u); + + auto *host_base = reinterpret_cast(info.host_signal_ptr - sizeof(HostDeviceMappedRegionHeader)); + auto *header = reinterpret_cast(host_base); + EXPECT_EQ(header->magic, HDMR_MAGIC); + EXPECT_EQ(header->version, HDMR_VERSION); + EXPECT_EQ(header->flags, 0u); + EXPECT_EQ(header->signal_count, 2u); + 
EXPECT_EQ(header->signal_offset, 64u); + EXPECT_EQ(header->data_offset, 64u + 2u * 64u); + EXPECT_EQ(header->data_bytes, 17u); + EXPECT_EQ(header->total_bytes, info.total_bytes); + + std::vector out(17, 0xAA); + ASSERT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, out.data(), out.size()), 0); + EXPECT_EQ(out, std::vector(17, 0)); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, DatacopyValidatesBoundsAndRoundTripsBytes) { + auto ctx = reinterpret_cast(0x30); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_heap_region), 0); + + const uint8_t input[4] = {1, 2, 3, 4}; + ASSERT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 2, input, sizeof(input)), 0); + uint8_t output[8] = {}; + ASSERT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, output, sizeof(output)), 0); + const uint8_t expected[8] = {0, 0, 1, 2, 3, 4, 0, 0}; + EXPECT_EQ(std::memcmp(output, expected, sizeof(expected)), 0); + + EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 8, input, 0), 0); + EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 8, input, 1), -EINVAL); + EXPECT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 9, output, 0), -EINVAL); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, InvokesPlatformCacheOpsAroundHostAccess) { + CacheOpsRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{8, 2, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_heap_region_with_cache_ops), 0); + EXPECT_GT(recorder.flush_count, 0); + + recorder.events.clear(); + const uint8_t input[4] = {1, 2, 3, 4}; + 
EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 0, input, sizeof(input)), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.back(), "flush"); + + recorder.events.clear(); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 1), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.back(), "flush"); + + recorder.events.clear(); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 1, 0), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.front(), "invalidate"); + + recorder.events.clear(); + uint8_t output[4] = {}; + EXPECT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, output, sizeof(output)), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.front(), "invalidate"); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, NotifyWaitUsesMonotonicSignals) { + auto ctx = reinterpret_cast(0x40); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_heap_region), 0); + + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 0, 0), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 1, 0), -EAGAIN); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 7), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 7, 0), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 8, 100), -EAGAIN); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 6), -EINVAL); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 1, 8), -EINVAL); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, RejectsStaleAndCrossContextHandles) { + auto ctx_a = reinterpret_cast(0x50); + auto ctx_b = reinterpret_cast(0x51); + 
HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx_a, &cfg, &region, allocate_heap_region), 0); + + HostDeviceMappedRegionInfo info{}; + EXPECT_EQ(host_device_mapped_region_info_common(ctx_b, region, &info), -EINVAL); + EXPECT_EQ(host_device_mapped_region_close_common(ctx_a, region), 0); + EXPECT_EQ(host_device_mapped_region_close_common(ctx_a, region), -EINVAL); + EXPECT_EQ(host_device_mapped_region_info_common(ctx_a, region, &info), -EINVAL); +} + +TEST(HostDeviceMappedRegion, CloseAllReleasesContextRegions) { + auto ctx = reinterpret_cast(0x60); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, &region, allocate_heap_region), 0); + host_device_mapped_region_close_all_common(ctx); + HostDeviceMappedRegionInfo info{}; + EXPECT_EQ(host_device_mapped_region_info_common(ctx, region, &info), -EINVAL); +} diff --git a/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp b/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp new file mode 100644 index 000000000..a531817a0 --- /dev/null +++ b/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp @@ -0,0 +1,11 @@ +#include + +#include "worker_manager.h" + +TEST(MailboxControlLayout, UsesFourArgsAndMovesResultToOffset48) { + EXPECT_EQ(CTRL_OFF_ARG0, 16); + EXPECT_EQ(CTRL_OFF_ARG1, 24); + EXPECT_EQ(CTRL_OFF_ARG2, 32); + EXPECT_EQ(CTRL_OFF_ARG3, 40); + EXPECT_EQ(CTRL_OFF_RESULT, 48); +} diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 0d6762c14..41fe8cd08 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -9,7 +9,7 @@ """Tests for CallConfig and ChipWorker state machine.""" import pytest -from _task_interface import CallConfig, _ChipWorker # pyright: ignore[reportMissingImports] +from _task_interface import CallConfig, MappedRegionInfo, _ChipWorker # 
pyright: ignore[reportMissingImports] # ============================================================================ # CallConfig tests @@ -136,6 +136,34 @@ def test_unregister_callable_before_init_raises(self): with pytest.raises(RuntimeError, match="not initialized"): worker.unregister_callable(0) + def test_mapped_region_methods_before_init_raise(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="not initialized"): + worker.open_mapped_region(8, 1, 0) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_info(1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_datacopy_h2region(1, 0, b"x") + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_datacopy_region2h(1, 0, 1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_notify(1, 0, 1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_wait(1, 0, 1, 0) + with pytest.raises(RuntimeError, match="not initialized"): + worker.close_mapped_region(1) + + def test_mapped_region_info_is_structured(self): + info = MappedRegionInfo(0, 0x1000, 16, 0, 0x2000, 2, 256, 0) + assert info.host_data_ptr == 0 + assert info.device_data_ptr == 0x1000 + assert info.data_bytes == 16 + assert info.host_signal_ptr == 0 + assert info.device_signal_ptr == 0x2000 + assert info.signal_count == 2 + assert info.total_bytes == 256 + assert info.flags == 0 + # ============================================================================ # Python-level ChipWorker wrapper tests diff --git a/tests/ut/py/test_worker/test_mapped_region_hw.py b/tests/ut/py/test_worker/test_mapped_region_hw.py new file mode 100644 index 000000000..bcdb92c0d --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_hw.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Hardware smoke coverage for HostDeviceMappedRegion on a2a3 onboard.""" + +from __future__ import annotations + +import os + +import pytest + + +@pytest.mark.requires_hardware("a2a3") +@pytest.mark.platforms(["a2a3"]) +def test_a2a3_onboard_mapped_region_host_side_smoke(st_device_ids): + from simpler.worker import Worker + from simpler_setup.runtime_builder import RuntimeBuilder + + build = bool(os.environ.get("PTO_UT_BUILD")) + _ = RuntimeBuilder(platform="a2a3").get_binaries("tensormap_and_ringbuffer", build=build) + device_id = int(st_device_ids[0]) + + worker = Worker(level=2, platform="a2a3", runtime="tensormap_and_ringbuffer", device_id=device_id, build=build) + worker.init() + try: + region = worker.open_mapped_region(128, signal_count=2) + try: + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 128 + assert info.signal_count == 2 + + payload = bytes((i * 7) % 251 for i in range(96)) + worker.mapped_region_datacopy_h2region(region, 16, payload) + assert worker.mapped_region_datacopy_region2h(region, 16, len(payload)) == payload + + worker.mapped_region_wait(region, 0, 0, 0) + with pytest.raises(TimeoutError): + 
worker.mapped_region_wait(region, 0, 1, 0) + worker.mapped_region_notify(region, 0, 3) + worker.mapped_region_wait(region, 0, 3, 0) + finally: + worker.close_mapped_region(region) + finally: + worker.close() diff --git a/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py b/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py new file mode 100644 index 000000000..aed24a719 --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py @@ -0,0 +1,56 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""CI gate for the mapped-region round-trip example.""" + +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + + +EXAMPLE = ( + Path(__file__).resolve().parents[4] + / "examples" + / "a2a3" + / "tensormap_and_ringbuffer" + / "host_device_mapped_region_round_trip" + / "main.py" +) +REPO_ROOT = Path(__file__).resolve().parents[4] + + +def _subprocess_env() -> dict[str, str]: + env = os.environ.copy() + paths = [str(REPO_ROOT), str(REPO_ROOT / "python")] + venv_lib = REPO_ROOT / ".venv" / "lib" + if venv_lib.exists(): + paths.extend(str(p) for p in sorted(venv_lib.glob("python*/site-packages"))) + existing = env.get("PYTHONPATH") + if existing: + paths.append(existing) + env["PYTHONPATH"] = os.pathsep.join(paths) + return env + + +@pytest.mark.requires_hardware("a2a3") +@pytest.mark.platforms(["a2a3"]) +def test_a2a3_onboard_mapped_region_real_npu_round_trip(st_device_ids): + result = subprocess.run( + [sys.executable, str(EXAMPLE), "-p", "a2a3", "-d", str(int(st_device_ids[0])), "--iters", "10"], + text=True, + capture_output=True, + timeout=180, + check=False, + env=_subprocess_env(), + ) + assert result.returncode == 0, result.stdout + result.stderr diff --git a/tests/ut/py/test_worker/test_mapped_region_sim.py b/tests/ut/py/test_worker/test_mapped_region_sim.py new file mode 100644 index 000000000..1bb17f808 --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_sim.py @@ -0,0 +1,225 @@ +import pytest + + +def _run_python_snippet(code: str) -> None: + import subprocess + import sys + import textwrap + + result = subprocess.run( + [sys.executable, "-c", textwrap.dedent(code)], + text=True, + capture_output=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + result.stdout + + +def test_worker_mailbox_control_offsets_match_cpp_contract(): + 
import simpler.worker as worker_mod + from _task_interface import ( # pyright: ignore[reportMissingImports] + CTRL_OFF_ARG0, + CTRL_OFF_ARG1, + CTRL_OFF_ARG2, + CTRL_OFF_ARG3, + CTRL_OFF_RESULT, + ) + + assert (worker_mod._CTRL_OFF_ARG0, worker_mod._CTRL_OFF_ARG1, worker_mod._CTRL_OFF_ARG2) == (16, 24, 32) + assert worker_mod._CTRL_OFF_ARG3 == 40 + assert worker_mod._CTRL_OFF_RESULT == 48 + assert (CTRL_OFF_ARG0, CTRL_OFF_ARG1, CTRL_OFF_ARG2, CTRL_OFF_ARG3, CTRL_OFF_RESULT) == ( + worker_mod._CTRL_OFF_ARG0, + worker_mod._CTRL_OFF_ARG1, + worker_mod._CTRL_OFF_ARG2, + worker_mod._CTRL_OFF_ARG3, + worker_mod._CTRL_OFF_RESULT, + ) + + +class FakeChipWorker: + def __init__(self): + self.calls = [] + + def open_mapped_region(self, data_bytes, signal_count=1, flags=0): + self.calls.append(("open", data_bytes, signal_count, flags)) + return 0xABC + + def mapped_region_info(self, handle): + self.calls.append(("info", handle)) + from simpler.task_interface import MappedRegionInfo + + return MappedRegionInfo(0, 0x1000, 16, 0, 0x2000, 2, 256, 0) + + def mapped_region_datacopy_h2region(self, handle, offset, data): + self.calls.append(("h2region", handle, offset, data)) + + def mapped_region_datacopy_region2h(self, handle, offset, nbytes): + self.calls.append(("region2h", handle, offset, nbytes)) + return b"out" + + def mapped_region_notify(self, handle, signal_id, value): + self.calls.append(("notify", handle, signal_id, value)) + + def mapped_region_wait(self, handle, signal_id, target, timeout_us): + self.calls.append(("wait", handle, signal_id, target, timeout_us)) + + def close_mapped_region(self, handle): + self.calls.append(("close", handle)) + + +def make_l2_worker_with_fake_chip(): + from simpler.worker import Worker + + worker = Worker(level=2) + worker._chip_worker = FakeChipWorker() + worker._initialized = True + return worker + + +def test_worker_l2_mapped_region_round_trips_to_chip_worker(): + worker = make_l2_worker_with_fake_chip() + + region = 
worker.open_mapped_region(16, signal_count=2) + assert region.handle == 0xABC + assert region.worker_id == 0 + assert region.data_bytes == 16 + assert region.signal_count == 2 + assert region.flags == 0 + assert region.closed is False + + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr == 0x1000 + + worker.mapped_region_datacopy_h2region(region, 4, b"abcd") + assert worker.mapped_region_datacopy_region2h(region, 0, 3) == b"out" + worker.mapped_region_notify(region, 1, 7) + worker.mapped_region_wait(region, 1, 7, 100) + worker.close_mapped_region(region) + + assert region.closed is True + assert worker._chip_worker.calls == [ + ("open", 16, 2, 0), + ("info", 0xABC), + ("h2region", 0xABC, 4, b"abcd"), + ("region2h", 0xABC, 0, 3), + ("notify", 0xABC, 1, 7), + ("wait", 0xABC, 1, 7, 100), + ("close", 0xABC), + ] + + +def test_worker_mapped_region_rejects_mismatched_worker_id_and_closed_wrapper(): + worker = make_l2_worker_with_fake_chip() + region = worker.open_mapped_region(16, signal_count=1) + + with pytest.raises(ValueError, match="worker_id"): + worker.mapped_region_info(region, worker_id=1) + + worker.close_mapped_region(region) + with pytest.raises(ValueError, match="closed"): + worker.mapped_region_notify(region, 0, 1) + + worker.close_mapped_region(region) + assert worker._chip_worker.calls[-1] == ("close", 0xABC) + assert worker._chip_worker.calls.count(("close", 0xABC)) == 1 + + +def test_worker_mapped_region_rejects_str_h2region_input(): + worker = make_l2_worker_with_fake_chip() + region = worker.open_mapped_region(16, signal_count=1) + + with pytest.raises(ValueError, match="bytes-like"): + worker.mapped_region_datacopy_h2region(region, 0, "text") + + +@pytest.mark.parametrize("platform", ["a2a3sim", "a5sim"]) +def test_worker_l2_mapped_region_sim_backend_round_trip(platform): + _run_python_snippet( + f""" + from simpler.worker import Worker + + worker = 
Worker(level=2, platform="{platform}", runtime="tensormap_and_ringbuffer", build=True) + worker.init() + try: + region = worker.open_mapped_region(8, signal_count=1) + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 8 + assert info.signal_count == 1 + + worker.mapped_region_datacopy_h2region(region, 2, b"abcd") + assert worker.mapped_region_datacopy_region2h(region, 0, 8) == b"\\x00\\x00abcd\\x00\\x00" + + worker.mapped_region_wait(region, 0, 0, 0) + try: + worker.mapped_region_wait(region, 0, 1, 0) + raise AssertionError("mapped_region_wait unexpectedly succeeded") + except TimeoutError: + pass + worker.mapped_region_notify(region, 0, 3) + worker.mapped_region_wait(region, 0, 3, 0) + + worker.close_mapped_region(region) + try: + worker.mapped_region_info(region) + raise AssertionError("closed mapped region unexpectedly succeeded") + except ValueError: + pass + finally: + worker.close() + """ + ) + + +@pytest.mark.parametrize("platform", ["a2a3sim", "a5sim"]) +def test_worker_l3_mapped_region_sim_backend_round_trip(platform): + _run_python_snippet( + f""" + from simpler.worker import Worker + + worker = Worker(level=3, device_ids=[0], platform="{platform}", runtime="tensormap_and_ringbuffer", build=True) + worker.init() + try: + region = worker.open_mapped_region(8192, signal_count=2, worker_id=0) + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 8192 + assert info.signal_count == 2 + + payload = bytes((i % 251 for i in range(5000))) + worker.mapped_region_datacopy_h2region(region, 1024, payload) + assert worker.mapped_region_datacopy_region2h(region, 1024, len(payload)) == payload + + try: + worker.mapped_region_wait(region, 1, 1, 0) + raise 
AssertionError("mapped_region_wait unexpectedly succeeded") + except TimeoutError: + pass + worker.mapped_region_notify(region, 1, 9) + worker.mapped_region_wait(region, 1, 9, 0) + + try: + worker.mapped_region_info(region, worker_id=1) + raise AssertionError("mismatched worker_id unexpectedly succeeded") + except ValueError: + pass + worker.close_mapped_region(region) + try: + worker.mapped_region_info(region) + raise AssertionError("closed mapped region unexpectedly succeeded") + except ValueError: + pass + finally: + worker.close() + """ + )