From 0c8f41aeef5737fa0f697eeddcaa0214b49860ee Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Tue, 26 May 2026 18:17:29 +0800 Subject: [PATCH 1/2] Add: host-device mapped region design --- docs/host-device_mapped-region_design.md | 779 +++++++++++++++++++++++ 1 file changed, 779 insertions(+) create mode 100644 docs/host-device_mapped-region_design.md diff --git a/docs/host-device_mapped-region_design.md b/docs/host-device_mapped-region_design.md new file mode 100644 index 000000000..f1c6f1fd6 --- /dev/null +++ b/docs/host-device_mapped-region_design.md @@ -0,0 +1,779 @@ +# HostDeviceMappedRegion Design + +## Purpose + +`HostDeviceMappedRegion` is a low-level L3 parent-to-chip-child/NPU +communication primitive in Simpler. It exposes a child-owned memory region +through a narrow host-side API: + +- device-visible data and signal addresses that can be passed to kernels, +- a raw data area addressed by byte offset, +- cache-line-sized signal slots, +- explicit host-side datacopy into and out of the region, and +- explicit host-side notify/wait operations on signal slots. + +The primitive is intentionally lower-level than task tensor payload handling, +shared-buffer protocols, and send/recv protocols. It does not construct +`ContinuousTensor` descriptors, infer dependencies, manage queue metadata, or +define a message format. Higher-level protocols can build those policies on +top of the mapped data area and signal slots. + +This design extracts only the first-layer `datacopy + notify/wait` primitive +from the lessons of PR803. Higher-level shared-buffer and send/recv protocols +are left to later designs. + +## Runtime Ownership + +`HostDeviceMappedRegion` is owned by the process that owns `DeviceContext`. + +In L3 PROCESS mode, that owner is the chip child process containing +`ChipWorker`, `DeviceRunner`, and the loaded host runtime. 
The L3 parent owns +only an opaque region wrapper and reaches the child through the existing +parent-child mailbox RPC path. + +The NPU does not own the allocation lifetime. It participates by reading and +writing the device-visible addresses returned by `mapped_region_info`. + +Host mappings are owner-process implementation details: + +- public Python `mapped_region_info()` always reports `host_data_ptr == 0`, + and `host_signal_ptr == 0`; +- host-side public access uses `mapped_region_datacopy_h2region()` and + `mapped_region_datacopy_region2h()` in all modes; +- L3 parent callers may receive `device_data_ptr` and `device_signal_ptr`, + because those values are task/kernel arguments rather than parent + dereferenceable addresses. + +The parent-child mailbox is a host-side control and proxy transport. It is not +the CPU-NPU mapped-region primitive itself. + +### Lifetime And Handle Ownership + +Each opened region is registered in the owning `DeviceContext`. The public +handle is the child-side pointer value, carried as an opaque token by the L3 +parent. The parent must never dereference that value. + +The owner context registry is still required. All `info`, datacopy, notify, +wait, and close operations validate that the handle exists in the supplied +context. A handle from another context, a stale handle, or a double-close is +invalid. + +`close_mapped_region` releases the owner-side host mapping and device +allocation, but it is not a device synchronization operation. The caller must +ensure no in-flight kernel, AICPU code, or other device participant still uses +`device_data_ptr` or `device_signal_ptr` before closing the region. Usually +that means waiting for task completion or for the protocol's completion signal +before close. + +`close_mapped_region` does not inspect task state, signal state, or implicitly +wait for device work. In-flight device access at close time is a caller +protocol error. 
+ +`finalize_device` / `destroy_device_context` releases any still-registered +mapped regions as a resource cleanup fallback. That cleanup does not make close +safe while device code is still accessing the region. + +## Public ABI + +The runtime C ABI defines an opaque handle plus config and info structures in +`src/common/worker/pto_runtime_c_api.h`: + +```cpp +typedef void *HostDeviceMappedRegionHandle; + +typedef struct { + uint64_t data_bytes; + uint32_t signal_count; + uint32_t flags; +} HostDeviceMappedRegionConfig; + +typedef struct { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint64_t total_bytes; + uint32_t flags; +} HostDeviceMappedRegionInfo; +``` + +`flags` is reserved and must be `0` in this design. Non-zero flags are invalid. + +The C ABI entry points are: + +```cpp +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + const HostDeviceMappedRegionConfig *cfg, + HostDeviceMappedRegionHandle *out_region +); +int close_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region +); +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + HostDeviceMappedRegionInfo *info +); +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + const void *src, + size_t nbytes +); +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + void *dst, + size_t nbytes +); +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t value +); +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t target, + uint32_t timeout_us 
+); +``` + +`open_host_device_mapped_region_ctx` returns `0` on success and writes the +handle to `out_region`. On failure it writes `NULL` to `out_region` and returns +a negative errno-style code. + +`close_host_device_mapped_region_ctx` takes the handle by value. It does not +clear caller-side storage. Reusing the same handle after close is invalid. + +### Error Model + +The C ABI uses negative errno-style return codes: + +- `0`: success. +- `-EINVAL`: invalid context, handle, config, range, signal id, value, or null + pointer. +- `-EAGAIN` / `-EWOULDBLOCK`: non-blocking wait miss or bounded wait timeout. +- `-ENOMEM`: allocation or wrapper-allocation failure. +- `-EIO`: backend mapping, datacopy, or signal failure. +- `-ENOTSUP`: unsupported platform or unsupported backend feature. + +Python bindings map invalid user input to `ValueError`, wait miss or timeout +to `TimeoutError`, and backend, allocation, or unsupported-platform failures +to `RuntimeError`. + +## Python API + +The Python API exposes matching expert operations through the existing +`ChipWorker`, `Worker`, and `Orchestrator` control chain: + +```python +region = worker.open_mapped_region( + data_bytes, + signal_count=2, + flags=0, + worker_id=0, +) +info = worker.mapped_region_info(region) + +worker.mapped_region_datacopy_h2region(region, offset, data) +out = worker.mapped_region_datacopy_region2h(region, offset, nbytes) + +worker.mapped_region_notify(region, signal_id, value) +worker.mapped_region_wait(region, signal_id, target, timeout_us) +worker.close_mapped_region(region) +``` + +`region` is a lightweight Python wrapper, not a naked integer. It records at +least: + +- the opaque child-side handle value, +- the owning `worker_id`, +- whether the region is closed, and +- the opened `data_bytes`, `signal_count`, and `flags`. + +Follow-up operations default to `region.worker_id`. If the caller explicitly +passes a different `worker_id`, Python raises `ValueError` before sending RPC. 
+Operations on a closed region also raise `ValueError`. + +The first implementation uses manual close only. Context-manager syntax such as +`with worker.open_mapped_region(...)` is out of scope. + +`mapped_region_info(region)` returns a structured `MappedRegionInfo` object, +not a dictionary. The fields mirror `HostDeviceMappedRegionInfo`, but public +Python host pointers are always masked: + +```python +MappedRegionInfo( + host_data_ptr=0, + device_data_ptr=..., + data_bytes=..., + host_signal_ptr=0, + device_signal_ptr=..., + signal_count=..., + total_bytes=..., + flags=..., +) +``` + +The primitive returns bare device pointers only. It does not construct +`ContinuousTensor`, `TaskArgs`, queue metadata, or message descriptors. If a +kernel expects tensor metadata, a higher layer may wrap +`info.device_data_ptr + offset` in `ContinuousTensor(..., child_memory=True)`. + +`mapped_region_datacopy_h2region(region, offset, data)` accepts readable, +C-contiguous bytes-like objects through the Python buffer protocol. `str` is +rejected; callers must encode explicitly. Non-contiguous buffers are invalid. + +`mapped_region_datacopy_region2h(region, offset, nbytes)` returns a new +`bytes` object. The first implementation does not support writing into a +caller-provided mutable buffer. 
+ +## Region Layout + +The region has a fixed internal layout: + +```text +HostDeviceMappedRegionHeader +HostDeviceMappedRegionSignalSlot[signal_count] +data[data_bytes] +padding to 64B +``` + +Header: + +```cpp +static constexpr uint32_t HDMR_MAGIC = 0x48444D52U; // "HDMR" +static constexpr uint32_t HDMR_VERSION = 1; + +struct alignas(64) HostDeviceMappedRegionHeader { + uint32_t magic; + uint32_t version; + uint32_t flags; + uint32_t signal_count; + uint64_t signal_offset; + uint64_t data_offset; + uint64_t data_bytes; + uint64_t total_bytes; + uint64_t reserved[2]; +}; +``` + +Signal slot: + +```cpp +struct alignas(64) HostDeviceMappedRegionSignalSlot { + volatile uint32_t value; + uint32_t reserved0; + uint64_t reserved[7]; +}; +``` + +Required static layout checks: + +```cpp +static_assert(sizeof(HostDeviceMappedRegionHeader) == 64); +static_assert(alignof(HostDeviceMappedRegionHeader) == 64); +static_assert(sizeof(HostDeviceMappedRegionSignalSlot) == 64); +static_assert(alignof(HostDeviceMappedRegionSignalSlot) == 64); +``` + +Each signal slot occupies one 64B cache line. This keeps independent signal +words from sharing a line and gives device code a conservative documented +layout. Signal values are `uint32_t`. 
+ +`HostDeviceMappedRegionInfo` is fixed at 64B: + +```cpp +static_assert(offsetof(HostDeviceMappedRegionInfo, host_data_ptr) == 0); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_data_ptr) == 8); +static_assert(offsetof(HostDeviceMappedRegionInfo, data_bytes) == 16); +static_assert(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr) == 24); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr) == 32); +static_assert(offsetof(HostDeviceMappedRegionInfo, signal_count) == 40); +static_assert(offsetof(HostDeviceMappedRegionInfo, total_bytes) == 48); +static_assert(offsetof(HostDeviceMappedRegionInfo, flags) == 56); +static_assert(sizeof(HostDeviceMappedRegionInfo) == 64); +``` + +Sizing: + +```text +signal_offset = sizeof(HostDeviceMappedRegionHeader) +data_offset = align64(signal_offset + signal_count * sizeof(SignalSlot)) +total_bytes = align64(data_offset + data_bytes) +``` + +Validation rules: + +- `data_bytes > 0` +- `signal_count > 0` +- `flags == 0` +- `offset <= data_bytes` +- `nbytes <= data_bytes - offset` +- `signal_id < signal_count` +- all arithmetic is checked for overflow + +This makes zero-length datacopy at `offset == data_bytes` valid while +rejecting non-zero copies at the end of the data area. Implementations should +use the subtraction form above rather than computing `offset + nbytes`, so +overflow cannot turn an out-of-range request into an in-range value. + +A newly opened region is zero-initialized. Header reserved fields, signal +values, data bytes, and padding are all zero before the region is returned. + +## L3 Control Transport + +L3 parent calls proxy through the existing mailbox control path. Small control +operations use mailbox arguments. Structured replies and payload-bearing +operations use POSIX `SharedMemory` side-band. 
+ +The mailbox control argument layout is extended to support four input values: + +```text +CTRL_OFF_ARG0 = 16 +CTRL_OFF_ARG1 = 24 +CTRL_OFF_ARG2 = 32 +CTRL_OFF_ARG3 = 40 +CTRL_OFF_RESULT = 48 +``` + +This moves `CTRL_OFF_RESULT` from offset 40 to 48. Existing control commands +must be updated on both C++ and Python sides. + +Mailbox-argument operations: + +- `open_mapped_region` +- `close_mapped_region` +- `mapped_region_notify` +- `mapped_region_wait` + +Side-band operations: + +- `mapped_region_info` +- `mapped_region_datacopy_h2region` +- `mapped_region_datacopy_region2h` + +`open_mapped_region` uses: + +```text +ARG0 = data_bytes +ARG1 = signal_count +ARG2 = flags +RESULT = region_handle +``` + +`close_mapped_region` uses: + +```text +ARG0 = region_handle +``` + +`mapped_region_notify` uses: + +```text +ARG0 = region_handle +ARG1 = signal_id +ARG2 = value +``` + +`mapped_region_wait` uses: + +```text +ARG0 = region_handle +ARG1 = signal_id +ARG2 = target +ARG3 = timeout_us +``` + +### Side-Band Header + +The side-band request/reply memory has a fixed little-endian 64B header. The +mailbox carries only the control command and the NUL-terminated shared-memory +name. + +```text +magic u32 "HMRD" +version u16 1 +op u16 1=info, 2=h2region, 3=region2h +region u64 opaque handle value +offset u64 +nbytes u64 +status i32 child writes 0 or negative errno +reserved u32 +reserved2 zero padding to 64B +payload starts at offset 64 +``` + +For `mapped_region_info`, the child writes a 64B +`HostDeviceMappedRegionInfo` payload after the header and masks +`host_data_ptr` and `host_signal_ptr` to zero before publishing +`CONTROL_DONE`. + +For `mapped_region_datacopy_h2region`, the parent writes the payload before +sending the mailbox request. The child validates the request and copies from +the side-band payload to the owner-process mapped region. + +For `mapped_region_datacopy_region2h`, the parent creates a side-band segment +large enough for the header plus `nbytes`. 
The child validates the request, +copies from the mapped region into the side-band payload, writes `status`, and +then publishes `CONTROL_DONE`. The parent reads the payload only after +`CONTROL_DONE`. + +The parent closes and unlinks the `SharedMemory` segment after the control +round trip. The child closes its mapping before publishing `CONTROL_DONE`. + +## Datacopy Semantics + +The datacopy APIs move raw bytes between a caller-provided host buffer and the +mapped region data area: + +```text +datacopy_h2region: + host buffer -> region data[offset:offset+nbytes] + +datacopy_region2h: + region data[offset:offset+nbytes] -> host buffer +``` + +In L3, host buffer transfer has two host-side steps: + +```text +h2region: + parent buffer -> SharedMemory payload -> child host mapping + +region2h: + child host mapping -> SharedMemory payload -> parent bytes +``` + +The side-band `SharedMemory` segment is only the L3 parent-to-child payload +transport. It does not provide CPU-NPU visibility semantics. + +Datacopy does not wait, notify, check protocol phase, update ring metadata, or +construct tensor descriptors. Protocols compose the primitives explicitly. For +example: + +```text +producer write = datacopy_h2region + notify +consumer read = wait + datacopy_region2h +``` + +For public Python callers, host writes become publishable only through a +successful `mapped_region_datacopy_h2region` call. `mapped_region_notify` is a +release publication point for those completed datacopy writes. Direct host +pointer writes are not part of the public Python contract. + +## Signal Semantics + +Signal values are `uint32_t`. A signal slot is a lightweight doorbell or phase +word for bounded protocol epochs, not a long-lived channel sequence counter. +Higher-level protocols such as SPSC channels that need long-running head, +tail, or sequence values should define their own `uint64_t` metadata in the +mapped data area, including their own wrap-around rules. 
+ +```text +notify(signal_id, value) + publish value to signal[signal_id] + +wait(signal_id, target, timeout_us) + complete when observed signal[signal_id] >= target + otherwise return would-block after timeout policy +``` + +Signal values are monotonic within one protocol epoch. Host-side notify +rejects a value lower than the current signal value with `-EINVAL`. Device-side +signal publication must also be monotonic; violating this is a protocol error +and wait behavior is no longer guaranteed. + +Wrap-around handling is out of scope. + +All signal values are initialized to zero. Therefore +`wait(signal_id, target=0, timeout_us=0)` is a legal non-blocking probe that +immediately succeeds. + +`wait` has only two modes: + +- `timeout_us == 0`: non-blocking probe. +- `timeout_us > 0`: bounded wait. + +There is no infinite wait mode. + +`timeout_us` is a best-effort upper bound measured with a monotonic host clock. +The implementation may return later than requested because of host scheduler +latency. It must not report timeout before the deadline, except when +`timeout_us == 0`, which performs no waiting. + +`device_signal_ptr` points to the device-visible signal slot array. Device code +may use that documented layout directly. There is no required device-side +helper API for the first implementation. + +### Memory Ordering + +`notify` is a release publication point for completed host datacopy writes. +`wait` is an acquire observation point for reads sequenced after it. + +For CPU produces / NPU consumes: + +```text +host datacopy_h2region(...) +host notify(signal_id, seq) +device acquire-poll signal_id >= seq +device reads data +``` + +If device code observes `signal_id >= seq`, device reads after that +observation must see host datacopy writes completed before the matching +`notify`. + +For NPU produces / CPU consumes: + +```text +device writes data +device release-stores signal_id = seq +host wait(signal_id, seq, timeout_us) +host datacopy_region2h(...) 
+``` + +If host wait succeeds, host datacopy reads after wait must see device writes +completed before the matching device signal publication. + +The first a2a3 implementation uses atomic release/acquire operations on mapped +signal slots. If the real-NPU round-trip validation is flaky or fails, the +backend must not expose weaker best-effort semantics. It must either add the +required Ascend cache-maintenance primitives and document them, or return +`-ENOTSUP` for a2a3 onboard. + +## Relationship To Task Tensor Payloads + +Simpler's normal task tensor payload path is built around `TaskArgs` and +`ContinuousTensor`. It is task-scoped and tensor-oriented: + +1. The user adds a `ContinuousTensor` to `TaskArgs`. +2. The task is dispatched to a chip child. +3. `init_runtime_impl` prepares device-side task arguments. +4. For ordinary tensors, the runtime allocates device memory and calls + `copy_to_device()` from the host pointer in `ContinuousTensor.data`. +5. The runtime replaces the tensor's data pointer with the device pointer + before launching device orchestration and kernels. +6. During validation/copy-back, recorded tensor pairs can be copied from + device memory back to the original host pointer. + +That path is convenient for normal kernel inputs and outputs. The runtime owns +the per-task tensor staging details, and the user describes tensors rather than +explicit data movement phases. + +`child_memory=True` is an opt-out from that automatic staging path. When a +`ContinuousTensor` is marked as child memory, `init_runtime_impl` treats +`ContinuousTensor.data` as an existing child-managed device pointer. It passes +the tensor through without allocating new device memory and without +`copy_to_device()`. The caller is responsible for allocating and populating the +device buffer. 
+ +`HostDeviceMappedRegion` is different from both: + +- Ordinary task tensor: `ContinuousTensor` host pointer, implicit + `copy_to_device()` and copy-back around a task, task/runtime-managed + lifetime, TensorMap and task dependencies for sync. +- `child_memory=True` tensor: existing device pointer, caller-managed H2D/D2H + copies, caller/child-managed lifetime, TensorMap can still see the tensor + argument. +- Mapped region: data offsets plus signal slots, explicit + `datacopy_h2region` / `datacopy_region2h`, explicit open/close on a + child-owned region, explicit notify/wait. + +Mapped regions produce device addresses that can be passed through existing +task arguments. They do not require changing `TaskArgs`, `ContinuousTensor`, +TensorMap, or the normal `copy_to_device()` path. + +If a mapped-region-backed tensor is submitted through `TaskArgs`, callers +should choose the appropriate tensor tag for scheduler behavior, usually +`NO_DEP` when synchronization is handled by the mapped region's signal +protocol. + +## Platform Support + +Supported platforms: + +- `a2a3` onboard +- `a2a3sim` +- `a5sim` + +Unsupported stub: + +- `a5` onboard + +Common implementation files: + +- `src/common/worker/host_device_mapped_region.h` +- `src/common/worker/host_device_mapped_region.cpp` + +Platform allocation and mapping live in existing runtime C ABI implementation +files: + +- `src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp` +- `src/a2a3/platform/sim/host/pto_runtime_c_api.cpp` +- `src/a5/platform/sim/host/pto_runtime_c_api.cpp` +- `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` + +The common implementation owns layout validation, bounds checks, datacopy, +host-side notify, and host-side wait. Platform code owns allocation, host +mapping, unmapping, and unsupported stubs. 
+ +### a2a3 Onboard Backend + +The a2a3 onboard backend reuses the existing repo transport pattern: + +```text +dev_ptr = DeviceRunner::allocate_tensor(total_bytes) +halHostRegister(dev_ptr, total_bytes, DEV_SVM_MAP_HOST, device_id, &host_ptr) +common_init(host_ptr, total_bytes, cfg) +``` + +`DeviceRunner::allocate_tensor` uses the platform `MemoryAllocator`, backed by +device memory allocation. `halHostRegister` is resolved from +`libascend_hal.so`, matching the existing profiling, tensor-dump, PMU, and +dep-gen paths. + +`common_init` writes the header and zero-fills signal, data, reserved, and +padding bytes through the owner-process host mapping. + +On close or cleanup, the backend unregisters the host mapping before freeing +device memory: + +```text +halHostUnregister(host_ptr, device_id) +DeviceRunner::free_tensor(dev_ptr) +``` + +If `libascend_hal.so` cannot be loaded, the HAL symbols are missing, or +`halHostRegister` fails, open returns `-EIO`. + +### Sim Backends + +`a2a3sim` and `a5sim` may use ordinary host memory as both the host mapping and +device-visible pointer in the simulation address space. They must still follow +the same layout, validation, and signal semantics as onboard. + +### a5 Onboard + +`a5` onboard returns `-ENOTSUP` for open. It must export the ABI symbols as +explicit unsupported stubs rather than omitting them. 
+ +## Thin NPU Example + +Add a minimal real-NPU round-trip example for `a2a3` onboard: + +```text +host: + data_bytes = 2 * N * sizeof(float) + signal_count = 2 + open mapped region + repeat seq in 1..10: + datacopy input pattern(seq) into data[0:N] + notify signal[0] = seq + submit or let AIV observe device_data_ptr/device_signal_ptr + +AIV: + acquire-poll signal[0] until >= seq + read input pattern(seq) + output[i] = input[i] + add_const + seq_adjustment + release-store signal[1] = seq + +host: + wait signal[1] >= seq with bounded timeout + datacopy output from data[N:2N] + verify exact expected bytes for seq +``` + +This proves: + +- host datacopy into mapped region is visible to NPU, +- host notify is visible to NPU, +- NPU writes to mapped region are visible to host, +- NPU signal publication is visible to host wait, and +- device pointers returned by `mapped_region_info` can be passed to kernels. + +The example reuses the same mapped region for all 10 iterations. Signal values +must increase monotonically. Input and output patterns must depend on `seq` so +stale reads cannot pass accidentally. + +If this example is flaky or fails, `a2a3` onboard must not be marked supported +until the missing ordering or cache-maintenance mechanism is implemented and +documented. + +## Tests + +Common C++ unit tests: + +- layout size, alignment, and `offsetof` assertions, +- invalid config handling, +- overflow and bounds checks, +- zero-initialization, +- datacopy h2region and region2h, +- host notify/wait, +- decreasing host notify rejection, and +- stale handle, double-close, and cross-context handle rejection. 
+ +Python sim tests: + +- `a2a3sim` and `a5sim` smoke coverage, +- direct and L3 proxy paths, +- public Python info returns `host_*_ptr == 0`, +- info returns valid `device_*_ptr`, +- region wrapper rejects mismatched `worker_id`, +- closed region operations raise `ValueError`, +- h2region rejects `str` and non-contiguous buffers, +- region2h returns `bytes`, +- L3 datacopy payloads larger than mailbox capacity, and +- datacopy plus host notify/wait behavior. + +Mailbox and side-band tests: + +- mapped-region `wait` uses `ARG3`, +- existing mailbox controls still read result at `CTRL_OFF_RESULT == 48`, +- malformed side-band magic/version/op is rejected, +- side-band `status` propagates negative errno-style failures, and +- `info` side-band replies mask host pointers in the child. + +Onboard example: + +- run the 10-iteration `a2a3` real-NPU round trip described above, +- verify the NPU code demonstrates acquire poll and release store against the + documented signal slot layout. + +Error and lifetime tests: + +- invalid config, stale handle, double-close, and cross-context handle use, +- non-blocking wait miss and bounded wait timeout map to timeout errors in + Python, +- unsupported `a5` onboard stubs fail explicitly, +- `finalize_device` / `destroy_device_context` release regions that were not + explicitly closed, and +- runtime shared objects export all mapped-region ABI symbols. + +## Out Of Scope + +`HostDeviceMappedRegion` does not define: + +- migrating PR803 `HostDeviceMemory` or `HostDeviceChannel`, +- send/recv message protocols, +- ring/channel protocols, +- queue metadata, +- TensorMap dependency publication, +- automatic tensor descriptor construction, +- formal device-side helper APIs, +- multi-chip communication protocols, +- performance benchmarking contracts, +- `a5` onboard allocation/mapping behavior, +- context-manager support for the Python wrapper, +- infinite wait semantics, +- non-zero flag semantics, or +- signal wrap-around handling. 
From 0277e749d067bb2056ca6b65b9402d8679eba5ba Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Wed, 27 May 2026 18:10:35 +0800 Subject: [PATCH 2/2] Add: host-device mapped region support - Add HostDeviceMappedRegion ABI, common implementation, and sim/onboard backends - Expose mapped-region lifecycle, datacopy, notify, and wait through ChipWorker and Worker - Add L3 control-plane proxying, round-trip examples, UT coverage, and user documentation --- docs/L3-L2-host-device-communication.md | 628 ++++++++++++++ docs/dynamic-linking.md | 10 +- docs/host-device_mapped-region_design.md | 779 ------------------ docs/worker-manager.md | 51 +- .../host_device_mapped_region_round_trip.cpp | 99 +++ ...t_device_mapped_region_round_trip_orch.cpp | 36 + .../main.py | 145 ++++ .../test_mapped_region_round_trip.py | 67 ++ python/bindings/task_interface.cpp | 133 +++ python/bindings/worker_bind.h | 37 + python/simpler/task_interface.py | 30 + python/simpler/worker.py | 399 ++++++++- src/a2a3/platform/onboard/host/CMakeLists.txt | 2 + .../platform/onboard/host/device_runner.cpp | 101 +++ .../platform/onboard/host/device_runner.h | 22 + .../host_device_mapped_region_onboard.cpp | 99 +++ .../host/host_device_mapped_region_onboard.h | 22 + .../onboard/host/pto_runtime_c_api.cpp | 55 +- src/a2a3/platform/sim/host/CMakeLists.txt | 2 + .../platform/sim/host/pto_runtime_c_api.cpp | 55 +- src/a5/platform/onboard/host/CMakeLists.txt | 1 + .../onboard/host/pto_runtime_c_api.cpp | 55 ++ src/a5/platform/sim/host/CMakeLists.txt | 2 + .../platform/sim/host/pto_runtime_c_api.cpp | 55 +- src/common/hierarchical/worker.h | 17 + src/common/hierarchical/worker_manager.cpp | 97 ++- src/common/hierarchical/worker_manager.h | 25 +- .../host_device_mapped_region.cpp | 490 +++++++++++ .../host_device_mapped_region.h | 89 ++ .../host_device_mapped_region_sim.cpp | 35 + .../host_device_mapped_region_sim.h | 22 + src/common/worker/chip_worker.cpp | 120 +++ 
src/common/worker/chip_worker.h | 24 + src/common/worker/pto_runtime_c_api.h | 47 ++ tests/ut/cpp/CMakeLists.txt | 19 + .../common/test_host_device_mapped_region.cpp | 319 +++++++ .../test_mailbox_control_layout.cpp | 11 + tests/ut/py/test_chip_worker.py | 30 +- .../py/test_worker/test_mapped_region_hw.py | 53 ++ .../test_mapped_region_round_trip_hw.py | 56 ++ .../py/test_worker/test_mapped_region_sim.py | 225 +++++ 41 files changed, 3768 insertions(+), 796 deletions(-) create mode 100644 docs/L3-L2-host-device-communication.md delete mode 100644 docs/host-device_mapped-region_design.md create mode 100644 examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py create mode 100644 src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp create mode 100644 src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h create mode 100644 src/common/host_device_comm/host_device_mapped_region.cpp create mode 100644 src/common/host_device_comm/host_device_mapped_region.h create mode 100644 src/common/host_device_comm/host_device_mapped_region_sim.cpp create mode 100644 src/common/host_device_comm/host_device_mapped_region_sim.h create mode 100644 tests/ut/cpp/common/test_host_device_mapped_region.cpp create mode 100644 tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp create mode 100644 tests/ut/py/test_worker/test_mapped_region_hw.py create mode 100644 tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py create mode 100644 tests/ut/py/test_worker/test_mapped_region_sim.py diff --git 
a/docs/L3-L2-host-device-communication.md b/docs/L3-L2-host-device-communication.md new file mode 100644 index 000000000..05dfaea51 --- /dev/null +++ b/docs/L3-L2-host-device-communication.md @@ -0,0 +1,628 @@ +# L3/L2 Host-Device Communication + +This page explains Simpler's L3/L2 host-device communication primitive: +`HostDeviceMappedRegion`. + +It is written for people learning the Simpler library. It covers ownership, +lifetime, the public ABI, Python usage, datacopy, signal semantics, and how +mapped regions relate to the normal `TaskArgs` tensor path. It intentionally +does not document mailbox payload layouts, backend allocation internals, or +test implementation details. + +For the surrounding runtime model, see +[hierarchical_level_runtime.md](hierarchical_level_runtime.md) and +[task-flow.md](task-flow.md). + +## Why It Exists + +Simpler's normal task path is tensor-oriented: + +1. User code builds `TaskArgs`. +2. The runtime stages tensor inputs to device memory. +3. A chip task runs. +4. Output tensors are copied back or validated through the task runtime. + +That path is the right default for ordinary kernel inputs and outputs. It is +less suitable when host code and device code need to coordinate across several +steps, reuse one device-visible buffer, or exchange small phase signals without +turning every phase into a new task payload. + +`HostDeviceMappedRegion` fills that gap. A mapped region gives host code: + +- a reusable data area that is visible to device code, +- one or more signal slots for simple phase handshakes, +- explicit host-to-region and region-to-host byte copies, and +- explicit notify/wait operations. + +The primitive is intentionally small. It does not define a queue, channel, +message format, tensor descriptor, or scheduler dependency policy. Higher-level +protocols can build those rules on top of the data area and signal slots. 
+ +## Runtime Ownership + +The mapped region is owned by the process that owns the chip-side +`DeviceContext`. + +At L2, a `Worker(level=2)` owns one `ChipWorker` and talks to one chip +directly: + +```text +Python Worker(level=2) + | + +-- ChipWorker + | + +-- L2 chip runtime and kernels +``` + +At L3, a `Worker(level=3)` owns one chip child process per device id. The L3 +parent exposes the same mapped-region methods, and `worker_id` selects which +chip child owns the region: + +```text +Python Worker(level=3) + | + +-- chip child 0 -> ChipWorker -> L2 chip 0 + +-- chip child 1 -> ChipWorker -> L2 chip 1 + +-- sub workers +``` + +In L3 process mode, the chip child process owns `ChipWorker`, `DeviceRunner`, +the loaded host runtime, and the mapped-region registry. The L3 parent owns +only a Python `MappedRegion` wrapper and reaches the child through the +existing parent-child control path. + +The NPU does not own the allocation lifetime. Device code participates by +reading and writing the device-visible addresses returned by +`mapped_region_info()`. + +Pointer semantics follow the ownership rule: + +- `device_data_ptr` and `device_signal_ptr` are public device-visible + addresses. They can be passed to kernels through `TaskArgs` scalars or + tensor metadata. +- `host_data_ptr` and `host_signal_ptr` are not public Python dereferenceable + addresses. The Python API reports them as `0`. +- Host code accesses the region through datacopy methods, not by writing a + mapped host pointer directly. + +The L3 mailbox is only a host-side proxy transport. It is not the CPU-NPU +mapped-region primitive itself. + +## Lifetime And Handle Ownership + +Each opened region is registered in the owning `DeviceContext`. All `info`, +datacopy, notify, wait, and close operations validate that the handle belongs +to that context. + +A handle from another context, a stale handle, a closed handle, or a +double-close is invalid. 
In Python, `MappedRegion` also records the owning +`worker_id`; passing a different `worker_id` to a later operation is rejected +before the request is sent to the chip child. + +`close_mapped_region()` releases the owner-side host mapping and device +allocation. It is not a device synchronization operation. The caller must +ensure no in-flight kernel, AICPU code, or other device participant still uses +`device_data_ptr` or `device_signal_ptr` before closing the region. + +Usually that means waiting for task completion or for the protocol's +completion signal before close: + +```text +host writes input +host notify input_ready +device reads input and writes output +device publishes output_ready +host wait output_ready +host reads output +host closes mapped region +``` + +`Worker.close()`, `finalize_device()`, and `destroy_device_context()` clean up +remaining mapped regions as a resource fallback. That cleanup does not make it +safe to close while device code is still accessing the region. + +## Public ABI + +The runtime C ABI defines an opaque handle plus config and info structures in +`src/common/worker/pto_runtime_c_api.h`. Most library users do not call this +ABI directly, but it is the stable boundary that `ChipWorker` resolves from a +runtime shared object. + +```cpp +typedef void *HostDeviceMappedRegionHandle; + +typedef struct HostDeviceMappedRegionConfig { + uint64_t data_bytes; + uint32_t signal_count; + uint32_t flags; +} HostDeviceMappedRegionConfig; + +typedef struct HostDeviceMappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint32_t reserved0; + uint64_t total_bytes; + uint32_t flags; + uint32_t reserved1; +} HostDeviceMappedRegionInfo; +``` + +`flags` is reserved and must be `0`. 
+ +The C ABI entry points are: + +```cpp +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + const HostDeviceMappedRegionConfig *cfg, + HostDeviceMappedRegionHandle *out_region +); + +int close_host_device_mapped_region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region +); + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + const void *src, + size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint64_t offset, + void *dst, + size_t nbytes +); + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t value +); + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, + HostDeviceMappedRegionHandle region, + uint32_t signal_id, + uint32_t target, + uint32_t timeout_us +); +``` + +The ABI uses negative errno-style return codes: + +- `0`: success. +- `-EINVAL`: invalid context, handle, config, range, signal id, value, or + pointer. +- `-EAGAIN` / `-EWOULDBLOCK`: non-blocking wait miss or bounded wait timeout. +- `-ENOMEM`: allocation or wrapper construction failure. +- `-EIO`: backend mapping, datacopy, or signal failure. +- `-ENOTSUP`: unsupported platform or unsupported backend feature. + +Python maps invalid user input to `ValueError`, wait miss or timeout to +`TimeoutError`, and backend or unsupported-platform failures to `RuntimeError`. 
+ +## Python API + +The user-facing API is exposed through `Worker`: + +```python +region = worker.open_mapped_region( + data_bytes, + signal_count=2, + flags=0, + worker_id=0, +) + +info = worker.mapped_region_info(region) + +worker.mapped_region_datacopy_h2region(region, offset, payload) +payload = worker.mapped_region_datacopy_region2h(region, offset, nbytes) + +worker.mapped_region_notify(region, signal_id, value) +worker.mapped_region_wait(region, signal_id, target, timeout_us) + +worker.close_mapped_region(region) +``` + +Direct L2 calls execute in the owner process. L3 calls route to the selected +chip child while preserving the same public method names. + +`MappedRegion` is a lightweight Python wrapper, not a pointer. It records: + +- the opaque runtime handle, +- the owning `worker_id`, +- the requested `data_bytes`, +- the requested `signal_count`, +- the reserved `flags` value, and +- whether the region is closed. + +Follow-up operations default to `region.worker_id`. Passing a different +`worker_id` is a user error. Operations on a closed region are also user +errors. + +`mapped_region_info()` returns a `MappedRegionInfo` object with: + +- `device_data_ptr`: device-visible base address of the data area, +- `device_signal_ptr`: device-visible base address of the signal slots, +- `data_bytes`: usable data bytes, +- `signal_count`: number of signal slots, +- `total_bytes`: backend allocation size, and +- `flags`: currently `0`. + +The host pointer fields are always reported as `0` in the public Python API. + +`mapped_region_datacopy_h2region()` accepts bytes-like contiguous buffers. +`str` is rejected; encode text explicitly before passing it. Non-contiguous +buffers are invalid. + +`mapped_region_datacopy_region2h()` returns a new `bytes` object. 
+ +## Datacopy Semantics + +The datacopy APIs move raw bytes between a caller-provided host buffer and the +mapped region data area: + +```text +datacopy_h2region: + host buffer -> region data[offset:offset+nbytes] + +datacopy_region2h: + region data[offset:offset+nbytes] -> host buffer +``` + +The data area is raw bytes. Simpler does not interpret offsets inside it, does +not construct tensor descriptors, and does not attach protocol meaning to a +range. The caller's protocol decides which offsets contain inputs, outputs, +headers, or message payloads. + +Bounds are checked against the configured `data_bytes`. A zero-length copy at +`offset == data_bytes` is valid; a non-zero copy past the end is invalid. + +Datacopy does not wait, notify, check protocol phase, update ring metadata, or +publish TensorMap dependencies. Protocols compose the primitives explicitly: + +```text +producer write = datacopy_h2region + notify +consumer read = wait + datacopy_region2h +``` + +Datacopy alone is not a synchronization boundary. Visibility to the other +participant is established by composing datacopy with the signal protocol +described below. + +For direct L2 calls, the runtime can copy through the owner process's mapped +host view. For L3 parent calls, the same Python method is proxied to the chip +child: + +```text +h2region: + parent buffer -> child request payload -> child mapped region + +region2h: + child mapped region -> child reply payload -> parent bytes +``` + +The proxy path is an implementation detail. The Python contract is the same +for L2 and L3. + +## Signal Semantics + +Signal slots provide lightweight phase or sequence synchronization: + +```text +notify(signal_id, value) + publish value to signal[signal_id] + +wait(signal_id, target, timeout_us) + complete when observed signal[signal_id] >= target + otherwise raise or return a timeout result +``` + +Signal values are `uint32_t`. 
A signal slot is best treated as a phase word +for bounded protocol epochs, not as a long-lived channel sequence counter. +Higher-level protocols that need long-running head, tail, or sequence values +should define their own metadata in the mapped data area. + +Signal values should be monotonic within one protocol epoch. Wrap-around +handling is not part of this primitive. + +`wait` has two modes: + +- `timeout_us == 0`: non-blocking probe. +- `timeout_us > 0`: bounded wait. + +There is no infinite wait mode. + +All signal slots start at zero. Therefore a non-blocking wait for target zero +can succeed immediately. + +`device_signal_ptr` points to the device-visible signal slot array. Device code +may use the documented signal layout directly. For example, a kernel can poll +signal slot 0, read input data, write output data, and then publish signal +slot 1. + +### Memory Ordering + +`notify` is a release publication point for writes sequenced before it. `wait` +is an acquire observation point for reads sequenced after it. + +For CPU produces / NPU consumes: + +```text +host datacopy_h2region(...) +host notify(signal_id, seq) +device wait or poll signal[signal_id] >= seq +device reads data +``` + +If device code observes `signal[signal_id] >= seq`, device reads after that +observation must see host writes completed before the matching `notify`. + +For NPU produces / CPU consumes: + +```text +device writes data +device publishes signal[signal_id] = seq +host wait(signal_id, seq, timeout_us) +host datacopy_region2h(...) +``` + +If host wait succeeds, host reads after wait must see device writes completed +before the matching device signal publication. + +Device code that accesses signal slots directly must preserve the same +ordering contract: polling a signal that reaches the target is an acquire +operation, and publishing a signal after data writes is a release operation. 
+ +## Relationship To Task Tensor Payloads + +Simpler's normal task tensor payload path is built around `TaskArgs` and +`ContinuousTensor`. It is task-scoped and tensor-oriented: + +1. The user adds a `ContinuousTensor` to `TaskArgs`. +2. The task is dispatched to a chip child. +3. The chip runtime prepares device-side task arguments. +4. For ordinary tensors, the runtime allocates device memory and copies from + the host pointer in `ContinuousTensor.data`. +5. The runtime replaces the tensor's data pointer with the device pointer + before launching device orchestration and kernels. +6. During validation or copy-back, recorded tensor pairs can be copied from + device memory back to the original host pointer. + +That path is convenient for normal kernel inputs and outputs. The runtime owns +the per-task tensor staging details, and the user describes tensors rather than +explicit data movement phases. + +`child_memory=True` is an opt-out from that automatic staging path. When a +`ContinuousTensor` is marked as child memory, the chip runtime treats +`ContinuousTensor.data` as an existing child-managed device pointer. It passes +the tensor through without allocating new device memory and without staging the +contents again. The caller is responsible for allocating and populating the +device buffer, commonly through `orch.malloc` plus `orch.copy_to`. + +`HostDeviceMappedRegion` is different from both: + +- Ordinary task tensor: `ContinuousTensor` host pointer, implicit staging and + optional copy-back around a task, task/runtime-managed lifetime, TensorMap + dependencies for synchronization. +- `child_memory=True` tensor: existing device pointer, caller-managed copies, + caller/child-managed lifetime, TensorMap can still see the tensor argument. +- Mapped region: data offsets plus signal slots, explicit datacopy, explicit + open/close on a chip-owned region, explicit notify/wait. 
+ +### Difference From `copy_to_device()` + +`copy_to_device()` copies from a host buffer into ordinary device memory. It is +used by the task runtime to stage tensor payloads before execution, and it is +also exposed through worker/orchestrator copy helpers for manually managed +device buffers. + +Mapped-region datacopy targets the mapped region's data area, not an arbitrary +device allocation. It is paired with `mapped_region_info()`, which exposes +device-side views of the region, and with signal slots that let a protocol +publish readiness or completion. + +In short: + +```text +copy_to_device: + host buffer -> device allocation + +mapped_region_datacopy_h2region: + host buffer -> chip-owned mapped-region data area +``` + +The mapped-region path is not a replacement for tensor staging. It is the +primitive to use when host and device need a persistent data area plus explicit +synchronization semantics. + +### Difference From `child_memory=True` + +`child_memory=True` changes how the task runtime interprets a tensor argument. +It says: this `ContinuousTensor.data` value is already a valid child-side +device pointer, so the runtime should not allocate, copy, or free it as an +ordinary task tensor. + +Mapped regions can provide such a pointer, but they do not by themselves make +a tensor. A caller may wrap `info.device_data_ptr` in a +`ContinuousTensor(..., child_memory=True)` when a kernel expects tensor +metadata. The mapped region still owns the backing allocation and signal +slots; `child_memory=True` only prevents the task runtime from trying to stage +that pointer again. + +This composition is useful for a kernel-facing data path: + +```text +host: + region = open_mapped_region(...) 
+ info = mapped_region_info(region) + mapped_region_datacopy_h2region(region, 0, input_bytes) + mapped_region_notify(region, 0, 1) + +task args: + tensor = ContinuousTensor.make( + info.device_data_ptr, + shape, + dtype, + child_memory=True, + ) + args.add_tensor(tensor, TensorArgType.NO_DEP) + args.add_scalar(info.device_signal_ptr) + +device: + wait or poll signal[0] + read or write data through device_data_ptr + publish signal[1] + +host: + mapped_region_wait(region, 1, 1, timeout_us) + output = mapped_region_datacopy_region2h(region, output_offset, nbytes) +``` + +The important boundary is that `child_memory=True` is a task-argument staging +flag, while `HostDeviceMappedRegion` is an allocation, address exposure, +datacopy, and signal primitive. + +### Choosing A Data Path + +Use ordinary tensors for standard task input/output payloads. Use +`child_memory=True` for manually allocated device buffers that should be passed +as tensors without automatic staging. Use `HostDeviceMappedRegion` when a +protocol needs a CPU/NPU-visible data area, persistent lifetime, explicit +byte-level datacopy, and signal slots. + +These choices can be combined. A mapped region can provide the backing device +address for a `child_memory=True` tensor, while the mapped-region signal slots +provide the protocol ordering. + +Mapped-region datacopy and signal operations do not publish TensorMap +dependencies and do not replace `TensorArgType` dependency tags. If a +mapped-region-backed tensor is submitted through `TaskArgs`, choose the tensor +tag deliberately. `NO_DEP` is usually the right tag when synchronization is +handled by the mapped-region signal protocol. + +## L2 Example + +This is the direct one-chip shape. 
It opens one region, reuses it across +iterations, and passes the device-visible addresses to a chip callable: + +```python +worker = Worker( + level=2, + platform="a2a3sim", + runtime="tensormap_and_ringbuffer", + device_id=0, +) +worker.init() + +region = worker.open_mapped_region(data_bytes * 2, signal_count=2) +info = worker.mapped_region_info(region) + +for seq in range(1, 11): + worker.mapped_region_datacopy_h2region(region, 0, make_payload(seq)) + worker.mapped_region_notify(region, 0, seq) + + args = TaskArgs() + args.add_scalar(info.device_data_ptr) + args.add_scalar(info.device_signal_ptr) + args.add_scalar(seq) + args.add_scalar(data_bytes) + worker.run(chip_cid, args, cfg) + + worker.mapped_region_wait(region, 1, seq, 1_000_000) + out = worker.mapped_region_datacopy_region2h( + region, + data_bytes, + data_bytes, + ) + +worker.close_mapped_region(region) +worker.close() +``` + +## L3 Example + +In L3, the parent `Worker` may have multiple chip children. `worker_id` +selects the chip child that owns the mapped region: + +```python +worker = Worker( + level=3, + device_ids=[0, 1], + platform="a2a3sim", + runtime="tensormap_and_ringbuffer", +) +worker.init() + +region0 = worker.open_mapped_region( + data_bytes, + signal_count=2, + worker_id=0, +) +info0 = worker.mapped_region_info(region0) + +region1 = worker.open_mapped_region( + data_bytes, + signal_count=2, + worker_id=1, +) +info1 = worker.mapped_region_info(region1) +``` + +Each region belongs to exactly one chip child. Do not pass a region opened for +`worker_id=0` to operations for `worker_id=1`, and do not pass its device +pointers to a task running on a different chip unless a higher-level protocol +explicitly supports that. + +## Platform Support + +Mapped regions are available on: + +- `a2a3sim` +- `a5sim` +- `a2a3` onboard + +`a5` onboard currently reports mapped regions as unsupported. 
+ +The portable contract is the public Python behavior described here: raw byte +datacopy, explicit signal notify/wait, masked host pointers, opaque handles, +and device-visible addresses suitable for task arguments. + +## Example Location + +The round-trip example lives at: + +```text +examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/ +``` + +Run it on simulation with: + +```bash +cd examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip +python main.py -p a2a3sim -d 0 +``` + +Run it on a2a3 hardware with: + +```bash +cd examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip +python main.py -p a2a3 -d 0 +``` diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 2539c8464..642ae404d 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -38,7 +38,7 @@ Python process (ChipWorker) | +-- rtRegisterAllKernel(aicore_binary) ← CANN kernel registration | +-- rtAicpuKernelLaunchExWithArgs(...) ← device-side execution | - +-- dlopen("libascend_hal.so", RTLD_NOW | RTLD_LOCAL) ← CANN HAL (profiling only) + +-- dlopen("libascend_hal.so", RTLD_NOW | RTLD_LOCAL) ← CANN HAL ``` Key difference: onboard does **not** dlopen AICPU/AICore as host-side SOs. @@ -93,9 +93,11 @@ execution. ### CANN HAL: `RTLD_NOW | RTLD_LOCAL` -`libascend_hal.so` is loaded only for performance profiling (SVM memory -mapping via `halHostRegister`/`halHostUnregister`). The handle is cached -in a file-scope `g_hal_handle` and never explicitly dlclosed. +`libascend_hal.so` is loaded for onboard HAL services that need SVM memory +mapping, including performance profiling buffers and a2a3 +`HostDeviceMappedRegion` host mappings via +`halHostRegister`/`halHostUnregister`. The handle is cached in a file-scope +`g_hal_handle` and never explicitly dlclosed. 
## All dlsym(RTLD_DEFAULT) Calls diff --git a/docs/host-device_mapped-region_design.md b/docs/host-device_mapped-region_design.md deleted file mode 100644 index f1c6f1fd6..000000000 --- a/docs/host-device_mapped-region_design.md +++ /dev/null @@ -1,779 +0,0 @@ -# HostDeviceMappedRegion Design - -## Purpose - -`HostDeviceMappedRegion` is a low-level L3 parent-to-chip-child/NPU -communication primitive in Simpler. It exposes a child-owned memory region -through a narrow host-side API: - -- device-visible data and signal addresses that can be passed to kernels, -- a raw data area addressed by byte offset, -- cache-line-sized signal slots, -- explicit host-side datacopy into and out of the region, and -- explicit host-side notify/wait operations on signal slots. - -The primitive is intentionally lower-level than task tensor payload handling, -shared-buffer protocols, and send/recv protocols. It does not construct -`ContinuousTensor` descriptors, infer dependencies, manage queue metadata, or -define a message format. Higher-level protocols can build those policies on -top of the mapped data area and signal slots. - -This design extracts only the first-layer `datacopy + notify/wait` primitive -from the lessons of PR803. Higher-level shared-buffer and send/recv protocols -are left to later designs. - -## Runtime Ownership - -`HostDeviceMappedRegion` is owned by the process that owns `DeviceContext`. - -In L3 PROCESS mode, that owner is the chip child process containing -`ChipWorker`, `DeviceRunner`, and the loaded host runtime. The L3 parent owns -only an opaque region wrapper and reaches the child through the existing -parent-child mailbox RPC path. - -The NPU does not own the allocation lifetime. It participates by reading and -writing the device-visible addresses returned by `mapped_region_info`. 
- -Host mappings are owner-process implementation details: - -- public Python `mapped_region_info()` always reports `host_data_ptr == 0`, - and `host_signal_ptr == 0`; -- host-side public access uses `mapped_region_datacopy_h2region()` and - `mapped_region_datacopy_region2h()` in all modes; -- L3 parent callers may receive `device_data_ptr` and `device_signal_ptr`, - because those values are task/kernel arguments rather than parent - dereferenceable addresses. - -The parent-child mailbox is a host-side control and proxy transport. It is not -the CPU-NPU mapped-region primitive itself. - -### Lifetime And Handle Ownership - -Each opened region is registered in the owning `DeviceContext`. The public -handle is the child-side pointer value, carried as an opaque token by the L3 -parent. The parent must never dereference that value. - -The owner context registry is still required. All `info`, datacopy, notify, -wait, and close operations validate that the handle exists in the supplied -context. A handle from another context, a stale handle, or a double-close is -invalid. - -`close_mapped_region` releases the owner-side host mapping and device -allocation, but it is not a device synchronization operation. The caller must -ensure no in-flight kernel, AICPU code, or other device participant still uses -`device_data_ptr` or `device_signal_ptr` before closing the region. Usually -that means waiting for task completion or for the protocol's completion signal -before close. - -`close_mapped_region` does not inspect task state, signal state, or implicitly -wait for device work. In-flight device access at close time is a caller -protocol error. - -`finalize_device` / `destroy_device_context` releases any still-registered -mapped regions as a resource cleanup fallback. That cleanup does not make close -safe while device code is still accessing the region. 
- -## Public ABI - -The runtime C ABI defines an opaque handle plus config and info structures in -`src/common/worker/pto_runtime_c_api.h`: - -```cpp -typedef void *HostDeviceMappedRegionHandle; - -typedef struct { - uint64_t data_bytes; - uint32_t signal_count; - uint32_t flags; -} HostDeviceMappedRegionConfig; - -typedef struct { - uint64_t host_data_ptr; - uint64_t device_data_ptr; - uint64_t data_bytes; - uint64_t host_signal_ptr; - uint64_t device_signal_ptr; - uint32_t signal_count; - uint64_t total_bytes; - uint32_t flags; -} HostDeviceMappedRegionInfo; -``` - -`flags` is reserved and must be `0` in this design. Non-zero flags are invalid. - -The C ABI entry points are: - -```cpp -int open_host_device_mapped_region_ctx( - DeviceContextHandle ctx, - const HostDeviceMappedRegionConfig *cfg, - HostDeviceMappedRegionHandle *out_region -); -int close_host_device_mapped_region_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region -); -int host_device_mapped_region_info_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region, - HostDeviceMappedRegionInfo *info -); -int host_device_mapped_region_datacopy_h2region_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region, - uint64_t offset, - const void *src, - size_t nbytes -); -int host_device_mapped_region_datacopy_region2h_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region, - uint64_t offset, - void *dst, - size_t nbytes -); -int host_device_mapped_region_notify_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region, - uint32_t signal_id, - uint32_t value -); -int host_device_mapped_region_wait_ctx( - DeviceContextHandle ctx, - HostDeviceMappedRegionHandle region, - uint32_t signal_id, - uint32_t target, - uint32_t timeout_us -); -``` - -`open_host_device_mapped_region_ctx` returns `0` on success and writes the -handle to `out_region`. On failure it writes `NULL` to `out_region` and returns -a negative errno-style code. 
- -`close_host_device_mapped_region_ctx` takes the handle by value. It does not -clear caller-side storage. Reusing the same handle after close is invalid. - -### Error Model - -The C ABI uses negative errno-style return codes: - -- `0`: success. -- `-EINVAL`: invalid context, handle, config, range, signal id, value, or null - pointer. -- `-EAGAIN` / `-EWOULDBLOCK`: non-blocking wait miss or bounded wait timeout. -- `-ENOMEM`: allocation or wrapper-allocation failure. -- `-EIO`: backend mapping, datacopy, or signal failure. -- `-ENOTSUP`: unsupported platform or unsupported backend feature. - -Python bindings map invalid user input to `ValueError`, wait miss or timeout -to `TimeoutError`, and backend, allocation, or unsupported-platform failures -to `RuntimeError`. - -## Python API - -The Python API exposes matching expert operations through the existing -`ChipWorker`, `Worker`, and `Orchestrator` control chain: - -```python -region = worker.open_mapped_region( - data_bytes, - signal_count=2, - flags=0, - worker_id=0, -) -info = worker.mapped_region_info(region) - -worker.mapped_region_datacopy_h2region(region, offset, data) -out = worker.mapped_region_datacopy_region2h(region, offset, nbytes) - -worker.mapped_region_notify(region, signal_id, value) -worker.mapped_region_wait(region, signal_id, target, timeout_us) -worker.close_mapped_region(region) -``` - -`region` is a lightweight Python wrapper, not a naked integer. It records at -least: - -- the opaque child-side handle value, -- the owning `worker_id`, -- whether the region is closed, and -- the opened `data_bytes`, `signal_count`, and `flags`. - -Follow-up operations default to `region.worker_id`. If the caller explicitly -passes a different `worker_id`, Python raises `ValueError` before sending RPC. -Operations on a closed region also raise `ValueError`. - -The first implementation uses manual close only. Context-manager syntax such as -`with worker.open_mapped_region(...)` is out of scope. 
- -`mapped_region_info(region)` returns a structured `MappedRegionInfo` object, -not a dictionary. The fields mirror `HostDeviceMappedRegionInfo`, but public -Python host pointers are always masked: - -```python -MappedRegionInfo( - host_data_ptr=0, - device_data_ptr=..., - data_bytes=..., - host_signal_ptr=0, - device_signal_ptr=..., - signal_count=..., - total_bytes=..., - flags=..., -) -``` - -The primitive returns bare device pointers only. It does not construct -`ContinuousTensor`, `TaskArgs`, queue metadata, or message descriptors. If a -kernel expects tensor metadata, a higher layer may wrap -`info.device_data_ptr + offset` in `ContinuousTensor(..., child_memory=True)`. - -`mapped_region_datacopy_h2region(region, offset, data)` accepts readable, -C-contiguous bytes-like objects through the Python buffer protocol. `str` is -rejected; callers must encode explicitly. Non-contiguous buffers are invalid. - -`mapped_region_datacopy_region2h(region, offset, nbytes)` returns a new -`bytes` object. The first implementation does not support writing into a -caller-provided mutable buffer. 
- -## Region Layout - -The region has a fixed internal layout: - -```text -HostDeviceMappedRegionHeader -HostDeviceMappedRegionSignalSlot[signal_count] -data[data_bytes] -padding to 64B -``` - -Header: - -```cpp -static constexpr uint32_t HDMR_MAGIC = 0x48444D52U; // "HDMR" -static constexpr uint32_t HDMR_VERSION = 1; - -struct alignas(64) HostDeviceMappedRegionHeader { - uint32_t magic; - uint32_t version; - uint32_t flags; - uint32_t signal_count; - uint64_t signal_offset; - uint64_t data_offset; - uint64_t data_bytes; - uint64_t total_bytes; - uint64_t reserved[2]; -}; -``` - -Signal slot: - -```cpp -struct alignas(64) HostDeviceMappedRegionSignalSlot { - volatile uint32_t value; - uint32_t reserved0; - uint64_t reserved[7]; -}; -``` - -Required static layout checks: - -```cpp -static_assert(sizeof(HostDeviceMappedRegionHeader) == 64); -static_assert(alignof(HostDeviceMappedRegionHeader) == 64); -static_assert(sizeof(HostDeviceMappedRegionSignalSlot) == 64); -static_assert(alignof(HostDeviceMappedRegionSignalSlot) == 64); -``` - -Each signal slot occupies one 64B cache line. This keeps independent signal -words from sharing a line and gives device code a conservative documented -layout. Signal values are `uint32_t`. 
- -`HostDeviceMappedRegionInfo` is fixed at 64B: - -```cpp -static_assert(offsetof(HostDeviceMappedRegionInfo, host_data_ptr) == 0); -static_assert(offsetof(HostDeviceMappedRegionInfo, device_data_ptr) == 8); -static_assert(offsetof(HostDeviceMappedRegionInfo, data_bytes) == 16); -static_assert(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr) == 24); -static_assert(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr) == 32); -static_assert(offsetof(HostDeviceMappedRegionInfo, signal_count) == 40); -static_assert(offsetof(HostDeviceMappedRegionInfo, total_bytes) == 48); -static_assert(offsetof(HostDeviceMappedRegionInfo, flags) == 56); -static_assert(sizeof(HostDeviceMappedRegionInfo) == 64); -``` - -Sizing: - -```text -signal_offset = sizeof(HostDeviceMappedRegionHeader) -data_offset = align64(signal_offset + signal_count * sizeof(SignalSlot)) -total_bytes = align64(data_offset + data_bytes) -``` - -Validation rules: - -- `data_bytes > 0` -- `signal_count > 0` -- `flags == 0` -- `offset <= data_bytes` -- `nbytes <= data_bytes - offset` -- `signal_id < signal_count` -- all arithmetic is checked for overflow - -This makes zero-length datacopy at `offset == data_bytes` valid while -rejecting non-zero copies at the end of the data area. Implementations should -use the subtraction form above rather than computing `offset + nbytes`, so -overflow cannot turn an out-of-range request into an in-range value. - -A newly opened region is zero-initialized. Header reserved fields, signal -values, data bytes, and padding are all zero before the region is returned. - -## L3 Control Transport - -L3 parent calls proxy through the existing mailbox control path. Small control -operations use mailbox arguments. Structured replies and payload-bearing -operations use POSIX `SharedMemory` side-band. 
- -The mailbox control argument layout is extended to support four input values: - -```text -CTRL_OFF_ARG0 = 16 -CTRL_OFF_ARG1 = 24 -CTRL_OFF_ARG2 = 32 -CTRL_OFF_ARG3 = 40 -CTRL_OFF_RESULT = 48 -``` - -This moves `CTRL_OFF_RESULT` from offset 40 to 48. Existing control commands -must be updated on both C++ and Python sides. - -Mailbox-argument operations: - -- `open_mapped_region` -- `close_mapped_region` -- `mapped_region_notify` -- `mapped_region_wait` - -Side-band operations: - -- `mapped_region_info` -- `mapped_region_datacopy_h2region` -- `mapped_region_datacopy_region2h` - -`open_mapped_region` uses: - -```text -ARG0 = data_bytes -ARG1 = signal_count -ARG2 = flags -RESULT = region_handle -``` - -`close_mapped_region` uses: - -```text -ARG0 = region_handle -``` - -`mapped_region_notify` uses: - -```text -ARG0 = region_handle -ARG1 = signal_id -ARG2 = value -``` - -`mapped_region_wait` uses: - -```text -ARG0 = region_handle -ARG1 = signal_id -ARG2 = target -ARG3 = timeout_us -``` - -### Side-Band Header - -The side-band request/reply memory has a fixed little-endian 64B header. The -mailbox carries only the control command and the NUL-terminated shared-memory -name. - -```text -magic u32 "HMRD" -version u16 1 -op u16 1=info, 2=h2region, 3=region2h -region u64 opaque handle value -offset u64 -nbytes u64 -status i32 child writes 0 or negative errno -reserved u32 -reserved2 zero padding to 64B -payload starts at offset 64 -``` - -For `mapped_region_info`, the child writes a 64B -`HostDeviceMappedRegionInfo` payload after the header and masks -`host_data_ptr` and `host_signal_ptr` to zero before publishing -`CONTROL_DONE`. - -For `mapped_region_datacopy_h2region`, the parent writes the payload before -sending the mailbox request. The child validates the request and copies from -the side-band payload to the owner-process mapped region. - -For `mapped_region_datacopy_region2h`, the parent creates a side-band segment -large enough for the header plus `nbytes`. 
The child validates the request, -copies from the mapped region into the side-band payload, writes `status`, and -then publishes `CONTROL_DONE`. The parent reads the payload only after -`CONTROL_DONE`. - -The parent closes and unlinks the `SharedMemory` segment after the control -round trip. The child closes its mapping before publishing `CONTROL_DONE`. - -## Datacopy Semantics - -The datacopy APIs move raw bytes between a caller-provided host buffer and the -mapped region data area: - -```text -datacopy_h2region: - host buffer -> region data[offset:offset+nbytes] - -datacopy_region2h: - region data[offset:offset+nbytes] -> host buffer -``` - -In L3, host buffer transfer has two host-side steps: - -```text -h2region: - parent buffer -> SharedMemory payload -> child host mapping - -region2h: - child host mapping -> SharedMemory payload -> parent bytes -``` - -The side-band `SharedMemory` segment is only the L3 parent-to-child payload -transport. It does not provide CPU-NPU visibility semantics. - -Datacopy does not wait, notify, check protocol phase, update ring metadata, or -construct tensor descriptors. Protocols compose the primitives explicitly. For -example: - -```text -producer write = datacopy_h2region + notify -consumer read = wait + datacopy_region2h -``` - -For public Python callers, host writes become publishable only through a -successful `mapped_region_datacopy_h2region` call. `mapped_region_notify` is a -release publication point for those completed datacopy writes. Direct host -pointer writes are not part of the public Python contract. - -## Signal Semantics - -Signal values are `uint32_t`. A signal slot is a lightweight doorbell or phase -word for bounded protocol epochs, not a long-lived channel sequence counter. -Higher-level protocols such as SPSC channels that need long-running head, -tail, or sequence values should define their own `uint64_t` metadata in the -mapped data area, including their own wrap-around rules. 
- -```text -notify(signal_id, value) - publish value to signal[signal_id] - -wait(signal_id, target, timeout_us) - complete when observed signal[signal_id] >= target - otherwise return would-block after timeout policy -``` - -Signal values are monotonic within one protocol epoch. Host-side notify -rejects a value lower than the current signal value with `-EINVAL`. Device-side -signal publication must also be monotonic; violating this is a protocol error -and wait behavior is no longer guaranteed. - -Wrap-around handling is out of scope. - -All signal values are initialized to zero. Therefore -`wait(signal_id, target=0, timeout_us=0)` is a legal non-blocking probe that -immediately succeeds. - -`wait` has only two modes: - -- `timeout_us == 0`: non-blocking probe. -- `timeout_us > 0`: bounded wait. - -There is no infinite wait mode. - -`timeout_us` is a best-effort upper bound measured with a monotonic host clock. -The implementation may return later than requested because of host scheduler -latency. It must not report timeout before the deadline, except when -`timeout_us == 0`, which performs no waiting. - -`device_signal_ptr` points to the device-visible signal slot array. Device code -may use that documented layout directly. There is no required device-side -helper API for the first implementation. - -### Memory Ordering - -`notify` is a release publication point for completed host datacopy writes. -`wait` is an acquire observation point for reads sequenced after it. - -For CPU produces / NPU consumes: - -```text -host datacopy_h2region(...) -host notify(signal_id, seq) -device acquire-poll signal_id >= seq -device reads data -``` - -If device code observes `signal_id >= seq`, device reads after that -observation must see host datacopy writes completed before the matching -`notify`. - -For NPU produces / CPU consumes: - -```text -device writes data -device release-stores signal_id = seq -host wait(signal_id, seq, timeout_us) -host datacopy_region2h(...) 
-``` - -If host wait succeeds, host datacopy reads after wait must see device writes -completed before the matching device signal publication. - -The first a2a3 implementation uses atomic release/acquire operations on mapped -signal slots. If the real-NPU round-trip validation is flaky or fails, the -backend must not expose weaker best-effort semantics. It must either add the -required Ascend cache-maintenance primitives and document them, or return -`-ENOTSUP` for a2a3 onboard. - -## Relationship To Task Tensor Payloads - -Simpler's normal task tensor payload path is built around `TaskArgs` and -`ContinuousTensor`. It is task-scoped and tensor-oriented: - -1. The user adds a `ContinuousTensor` to `TaskArgs`. -2. The task is dispatched to a chip child. -3. `init_runtime_impl` prepares device-side task arguments. -4. For ordinary tensors, the runtime allocates device memory and calls - `copy_to_device()` from the host pointer in `ContinuousTensor.data`. -5. The runtime replaces the tensor's data pointer with the device pointer - before launching device orchestration and kernels. -6. During validation/copy-back, recorded tensor pairs can be copied from - device memory back to the original host pointer. - -That path is convenient for normal kernel inputs and outputs. The runtime owns -the per-task tensor staging details, and the user describes tensors rather than -explicit data movement phases. - -`child_memory=True` is an opt-out from that automatic staging path. When a -`ContinuousTensor` is marked as child memory, `init_runtime_impl` treats -`ContinuousTensor.data` as an existing child-managed device pointer. It passes -the tensor through without allocating new device memory and without -`copy_to_device()`. The caller is responsible for allocating and populating the -device buffer. 
- -`HostDeviceMappedRegion` is different from both: - -- Ordinary task tensor: `ContinuousTensor` host pointer, implicit - `copy_to_device()` and copy-back around a task, task/runtime-managed - lifetime, TensorMap and task dependencies for sync. -- `child_memory=True` tensor: existing device pointer, caller-managed H2D/D2H - copies, caller/child-managed lifetime, TensorMap can still see the tensor - argument. -- Mapped region: data offsets plus signal slots, explicit - `datacopy_h2region` / `datacopy_region2h`, explicit open/close on a - child-owned region, explicit notify/wait. - -Mapped regions produce device addresses that can be passed through existing -task arguments. They do not require changing `TaskArgs`, `ContinuousTensor`, -TensorMap, or the normal `copy_to_device()` path. - -If a mapped-region-backed tensor is submitted through `TaskArgs`, callers -should choose the appropriate tensor tag for scheduler behavior, usually -`NO_DEP` when synchronization is handled by the mapped region's signal -protocol. - -## Platform Support - -Supported platforms: - -- `a2a3` onboard -- `a2a3sim` -- `a5sim` - -Unsupported stub: - -- `a5` onboard - -Common implementation files: - -- `src/common/worker/host_device_mapped_region.h` -- `src/common/worker/host_device_mapped_region.cpp` - -Platform allocation and mapping live in existing runtime C ABI implementation -files: - -- `src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp` -- `src/a2a3/platform/sim/host/pto_runtime_c_api.cpp` -- `src/a5/platform/sim/host/pto_runtime_c_api.cpp` -- `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` - -The common implementation owns layout validation, bounds checks, datacopy, -host-side notify, and host-side wait. Platform code owns allocation, host -mapping, unmapping, and unsupported stubs. 
- -### a2a3 Onboard Backend - -The a2a3 onboard backend reuses the existing repo transport pattern: - -```text -dev_ptr = DeviceRunner::allocate_tensor(total_bytes) -halHostRegister(dev_ptr, total_bytes, DEV_SVM_MAP_HOST, device_id, &host_ptr) -common_init(host_ptr, total_bytes, cfg) -``` - -`DeviceRunner::allocate_tensor` uses the platform `MemoryAllocator`, backed by -device memory allocation. `halHostRegister` is resolved from -`libascend_hal.so`, matching the existing profiling, tensor-dump, PMU, and -dep-gen paths. - -`common_init` writes the header and zero-fills signal, data, reserved, and -padding bytes through the owner-process host mapping. - -On close or cleanup, the backend unregisters the host mapping before freeing -device memory: - -```text -halHostUnregister(host_ptr, device_id) -DeviceRunner::free_tensor(dev_ptr) -``` - -If `libascend_hal.so` cannot be loaded, the HAL symbols are missing, or -`halHostRegister` fails, open returns `-EIO`. - -### Sim Backends - -`a2a3sim` and `a5sim` may use ordinary host memory as both the host mapping and -device-visible pointer in the simulation address space. They must still follow -the same layout, validation, and signal semantics as onboard. - -### a5 Onboard - -`a5` onboard returns `-ENOTSUP` for open. It must export the ABI symbols as -explicit unsupported stubs rather than omitting them. 
- -## Thin NPU Example - -Add a minimal real-NPU round-trip example for `a2a3` onboard: - -```text -host: - data_bytes = 2 * N * sizeof(float) - signal_count = 2 - open mapped region - repeat seq in 1..10: - datacopy input pattern(seq) into data[0:N] - notify signal[0] = seq - submit or let AIV observe device_data_ptr/device_signal_ptr - -AIV: - acquire-poll signal[0] until >= seq - read input pattern(seq) - output[i] = input[i] + add_const + seq_adjustment - release-store signal[1] = seq - -host: - wait signal[1] >= seq with bounded timeout - datacopy output from data[N:2N] - verify exact expected bytes for seq -``` - -This proves: - -- host datacopy into mapped region is visible to NPU, -- host notify is visible to NPU, -- NPU writes to mapped region are visible to host, -- NPU signal publication is visible to host wait, and -- device pointers returned by `mapped_region_info` can be passed to kernels. - -The example reuses the same mapped region for all 10 iterations. Signal values -must increase monotonically. Input and output patterns must depend on `seq` so -stale reads cannot pass accidentally. - -If this example is flaky or fails, `a2a3` onboard must not be marked supported -until the missing ordering or cache-maintenance mechanism is implemented and -documented. - -## Tests - -Common C++ unit tests: - -- layout size, alignment, and `offsetof` assertions, -- invalid config handling, -- overflow and bounds checks, -- zero-initialization, -- datacopy h2region and region2h, -- host notify/wait, -- decreasing host notify rejection, and -- stale handle, double-close, and cross-context handle rejection. 
- -Python sim tests: - -- `a2a3sim` and `a5sim` smoke coverage, -- direct and L3 proxy paths, -- public Python info returns `host_*_ptr == 0`, -- info returns valid `device_*_ptr`, -- region wrapper rejects mismatched `worker_id`, -- closed region operations raise `ValueError`, -- h2region rejects `str` and non-contiguous buffers, -- region2h returns `bytes`, -- L3 datacopy payloads larger than mailbox capacity, and -- datacopy plus host notify/wait behavior. - -Mailbox and side-band tests: - -- mapped-region `wait` uses `ARG3`, -- existing mailbox controls still read result at `CTRL_OFF_RESULT == 48`, -- malformed side-band magic/version/op is rejected, -- side-band `status` propagates negative errno-style failures, and -- `info` side-band replies mask host pointers in the child. - -Onboard example: - -- run the 10-iteration `a2a3` real-NPU round trip described above, -- verify the NPU code demonstrates acquire poll and release store against the - documented signal slot layout. - -Error and lifetime tests: - -- invalid config, stale handle, double-close, and cross-context handle use, -- non-blocking wait miss and bounded wait timeout map to timeout errors in - Python, -- unsupported `a5` onboard stubs fail explicitly, -- `finalize_device` / `destroy_device_context` release regions that were not - explicitly closed, and -- runtime shared objects export all mapped-region ABI symbols. - -## Out Of Scope - -`HostDeviceMappedRegion` does not define: - -- migrating PR803 `HostDeviceMemory` or `HostDeviceChannel`, -- send/recv message protocols, -- ring/channel protocols, -- queue metadata, -- TensorMap dependency publication, -- automatic tensor descriptor construction, -- formal device-side helper APIs, -- multi-chip communication protocols, -- performance benchmarking contracts, -- `a5` onboard allocation/mapping behavior, -- context-manager support for the Python wrapper, -- infinite wait semantics, -- non-zero flag semantics, or -- signal wrap-around handling. 
diff --git a/docs/worker-manager.md b/docs/worker-manager.md index 0be0480ea..87e70fa55 100644 --- a/docs/worker-manager.md +++ b/docs/worker-manager.md @@ -183,7 +183,56 @@ mailbox_size_ = HEADER_SIZE // 8 B (state + error) Per-worker total: ~2 KB. Typical pool: 4-8 workers → ~8-16 KB shm total. -### 3.4 Shutdown +### 3.4 Control-plane commands + +The mailbox is also the per-child control channel. When the parent writes +`CONTROL_REQUEST`, offset 8 carries a `CTRL_*` sub-command instead of a task +callable id. The child loop handles the command in the same polling state +machine as `TASK_READY`, writes `MAILBOX_OFF_ERROR` / `MAILBOX_OFF_ERROR_MSG` +and any scalar result, then publishes `CONTROL_DONE`. + +Task dispatch and control commands share one mailbox. Parent-side +`dispatch_process()` and every `control_*()` method serialize on the same +`mailbox_mu_`, so a control request issued while a task is running waits for +that task's mailbox round trip to finish before it claims the state field. +This is a WorkerManager-level RPC contract; individual features only define +their own `CTRL_*` sub-command and payload schema. + +The fixed control slot layout is: + +```text +offset 8: uint64 control sub-command +offset 16: uint64 arg0 +offset 24: uint64 arg1 +offset 32: uint64 arg2 +offset 40: uint64 arg3 +offset 48: uint64 result +``` + +The meaning of `arg0..arg3` is sub-command-specific. Commands that return one +scalar or pointer write it at `CTRL_OFF_RESULT`; commands with larger request +or reply payloads pass fixed-width POSIX shared-memory names through +`MAILBOX_OFF_ARGS`. + +Current control-plane users include: + +- Device memory control from the orchestrator: + `CTRL_MALLOC`, `CTRL_FREE`, `CTRL_COPY_TO`, and `CTRL_COPY_FROM`. +- Callable lifecycle control: + `CTRL_PREPARE`, `CTRL_REGISTER`, `CTRL_UNREGISTER`, + `CTRL_PY_REGISTER`, and `CTRL_PY_UNREGISTER`. +- Communication-domain setup: + `CTRL_COMM_INIT`, `CTRL_ALLOC_DOMAIN`, and `CTRL_RELEASE_DOMAIN`. 
+- Host/device mapped-region operations: + `CTRL_OPEN_MAPPED_REGION`, `CTRL_CLOSE_MAPPED_REGION`, + `CTRL_MAPPED_REGION_INFO`, datacopy, notify, and wait commands. + +When adding a new control command, keep the mailbox fields limited to small +fixed arguments and move variable-sized payloads into side-band shared memory. +The child must always publish `CONTROL_DONE` with a clear error code/message +before the parent releases the mailbox back to `IDLE`. + +### 3.5 Shutdown `WorkerManager::shutdown_children()` writes `SHUTDOWN` to every registered mailbox; each child loop sees it on its next poll and exits. The Python diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp new file mode 100644 index 000000000..88229c8a2 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/aiv/host_device_mapped_region_round_trip.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <cstdint>
+
+#include <pto/pto-inst.hpp>
+
+#ifndef __gm__
+#define __gm__
+#endif
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+#include "pipe_sync.h"
+
+static constexpr uint64_t kCacheLineBytes = 64;  // also the stride between consecutive signal slots
+static constexpr uint32_t kMaxPollIters = 1024U;  // bounded poll: a missed notify cannot hang the core
+
+static inline __aicore__ void flush_range(volatile __gm__ void *addr, uint64_t size_bytes) {
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(kCacheLineBytes) - 1u);
+    uintptr_t end =
+        (reinterpret_cast<uintptr_t>(addr) + size_bytes + kCacheLineBytes - 1u) & ~(uintptr_t(kCacheLineBytes) - 1u);
+    for (uintptr_t p = start; p < end; p += kCacheLineBytes) {
+        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
+    }
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+    pipe_barrier(PIPE_ALL);
+#else
+    (void)addr;
+    (void)size_bytes;
+    __asm__ __volatile__("" ::: "memory");  // compiler-only barrier on builds without the cache intrinsics
+#endif
+}
+
+static inline __aicore__ void invalidate_range(volatile __gm__ void *addr, uint64_t size_bytes) {
+#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
+    uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(kCacheLineBytes) - 1u);
+    uintptr_t end =
+        (reinterpret_cast<uintptr_t>(addr) + size_bytes + kCacheLineBytes - 1u) & ~(uintptr_t(kCacheLineBytes) - 1u);
+    for (uintptr_t p = start; p < end; p += kCacheLineBytes) {
+        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE);
+    }
+#if defined(__CPU_SIM)
+    dsb(0);
+#else
+    dsb(DSB_DDR);
+#endif
+#else
+    (void)addr;
+    (void)size_bytes;
+    __asm__ __volatile__("" ::: "memory");  // compiler-only barrier on builds without the cache intrinsics
+#endif
+}
+
+static inline __aicore__ volatile __gm__ uint32_t *signal_slot(__gm__ uint8_t *signal_base, uint32_t signal_id) {
+    return reinterpret_cast<volatile __gm__ uint32_t *>(signal_base + signal_id * kCacheLineBytes);
+}
+
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    auto *data = reinterpret_cast<__gm__ uint8_t *>(static_cast<uintptr_t>(args[0]));
+    auto *signal_base = reinterpret_cast<__gm__ uint8_t *>(static_cast<uintptr_t>(args[1]));
+    auto *signal0 = signal_slot(signal_base, 0);
+    auto *signal1 = signal_slot(signal_base, 1);
+    uint32_t seq = static_cast<uint32_t>(args[2]);
+    uint32_t nbytes = static_cast<uint32_t>(args[3]);
+
+    bool observed = false;  // stays false when signal[0] never reaches seq within kMaxPollIters
+    for (uint32_t i = 0; i < kMaxPollIters; ++i) {
+        invalidate_range(signal0, kCacheLineBytes);
+        if (*signal0 >= seq) {
+            observed = true;
+            break;
+        }
+    }
+
+    invalidate_range(data, nbytes);
+    for (uint32_t i = 0; i < nbytes; ++i) {
+        uint8_t mask = observed ? static_cast<uint8_t>(seq + i * 3U) : static_cast<uint8_t>(0xA5U);
+        data[nbytes + i] = static_cast<uint8_t>(data[i] ^ mask);  // output half starts at byte offset nbytes
+    }
+    flush_range(data + nbytes, nbytes);
+
+    *signal1 = seq;  // published even if the poll gave up; the 0xA5-masked output then fails the host check
+    flush_range(signal1, kCacheLineBytes);
+}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp
new file mode 100644
index 000000000..84f230e3c
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/kernels/orchestration/host_device_mapped_region_round_trip_orch.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+host_device_mapped_region_round_trip_config(const ChipStorageTaskArgs &orch_args) {
+    (void)orch_args;  // the configuration is static and ignores the task arguments
+    return PTO2OrchestrationConfig{.expected_arg_count = 4};  // the entry point below reads scalars 0..3
+}
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    return host_device_mapped_region_round_trip_config(orch_args);  // alias of the example-specific config
+}
+
+__attribute__((visibility("default"))) void host_device_mapped_region_round_trip_orch(const ChipStorageTaskArgs &orch_args) {
+    Arg args;
+    args.add_scalar(orch_args.scalar(0));  // forwarded unchanged, in order, to the single AIV task
+    args.add_scalar(orch_args.scalar(1));
+    args.add_scalar(orch_args.scalar(2));
+    args.add_scalar(orch_args.scalar(3));
+    rt_submit_aiv_task(0, args);  // NOTE(review): 0 presumably selects the AIV child registered at index 0
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py
new file mode 100644
index 000000000..516e0f567
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Host CPU to device NPU round-trip through HostDeviceMappedRegion."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+from simpler.task_interface import ArgDirection, CallConfig, ChipCallable, CoreCallable, TaskArgs
+from simpler.worker import Worker
+from simpler_setup.elf_parser import extract_text_section
+from simpler_setup.kernel_compiler import KernelCompiler
+from simpler_setup.pto_isa import ensure_pto_isa_root
+from simpler_setup.runtime_builder import RuntimeBuilder
+
+
+HERE = Path(__file__).resolve().parent
+KERNEL_DIR = HERE / "kernels"
+RUNTIME = "tensormap_and_ringbuffer"
+DEFAULT_DATA_BYTES = 256
+DEFAULT_ITERS = 10
+
+
+def _build_callable(platform: str) -> ChipCallable:  # compile the AIV kernel + orchestration into one callable
+    kc = KernelCompiler(platform=platform)
+    pto_isa_root = ensure_pto_isa_root(clone_protocol="https")
+    include_dirs = kc.get_orchestration_include_dirs(RUNTIME)
+
+    incore = kc.compile_incore(
+        source_path=str(KERNEL_DIR / "aiv" / "host_device_mapped_region_round_trip.cpp"),
+        core_type="aiv",
+        pto_isa_root=pto_isa_root,
+        extra_include_dirs=include_dirs,
+    )
+    if not platform.endswith("sim"):
+        incore = extract_text_section(incore)
+
+    orch = kc.compile_orchestration(
+        runtime_name=RUNTIME,
+        source_path=str(KERNEL_DIR / "orchestration" / "host_device_mapped_region_round_trip_orch.cpp"),
+    )
+    return ChipCallable.build(
+        signature=[ArgDirection.IN, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN],
+        func_name="host_device_mapped_region_round_trip_orch",
+        binary=orch,
+        children=[(0, CoreCallable.build(signature=[], binary=incore))],
+    )
+
+
+def _pattern(seq: int, data_bytes: int) -> bytes:  # seq-dependent input so stale data cannot pass
+    return bytes(((seq * 17 + i * 5) & 0xFF) for i in range(data_bytes))
+
+
+def _expected(seq: int, payload: bytes) -> bytes:  # mirrors the AIV kernel: out[i] = in[i] ^ ((seq + 3*i) & 0xFF)
+    return bytes((b ^ ((seq + i * 3) & 0xFF)) for i, b in enumerate(payload))
+
+
+def run(
+    platform: str,
+    device_id: int,
+    *,
+    build: bool = False,
+    iters: int = DEFAULT_ITERS,
+    data_bytes: int = DEFAULT_DATA_BYTES,
+) -> None:
+    if platform not in {"a2a3sim", "a2a3"}:
+        raise ValueError(f"unsupported platform: {platform}")
+    if iters <= 0:
+        raise ValueError("iters must be positive")
+    if data_bytes <= 0:
+        raise ValueError("data_bytes must be positive")
+
+    os.environ["PTO_ISA_ROOT"] = ensure_pto_isa_root(clone_protocol="https")
+    RuntimeBuilder(platform=platform).get_binaries(RUNTIME, build=build)
+    chip_callable = _build_callable(platform)
+
+    worker = Worker(level=2, platform=platform, runtime=RUNTIME, device_id=device_id, build=build)
+    worker.init()
+    region = None
+    try:
+        chip_cid = worker.register(chip_callable)
+        region = worker.open_mapped_region(data_bytes * 2, signal_count=2)  # input half + output half
+        info = worker.mapped_region_info(region)
+        if info.host_data_ptr != 0 or info.host_signal_ptr != 0:  # explicit raise: bare assert is stripped by -O
+            raise AssertionError(f"host pointers not masked: {info.host_data_ptr}, {info.host_signal_ptr}")
+        if info.device_data_ptr == 0 or info.device_signal_ptr == 0:
+            raise AssertionError(f"null device pointer: {info.device_data_ptr}, {info.device_signal_ptr}")
+
+        cfg = CallConfig()
+        cfg.block_dim = 1
+        cfg.aicpu_thread_num = 2
+
+        for seq in range(1, iters + 1):
+            payload = _pattern(seq, data_bytes)
+            worker.mapped_region_datacopy_h2region(region, 0, payload)
+            worker.mapped_region_notify(region, 0, seq)  # publish the input before the kernel polls signal 0
+
+            args = TaskArgs()
+            args.add_scalar(info.device_data_ptr)
+            args.add_scalar(info.device_signal_ptr)
+            args.add_scalar(seq)
+            args.add_scalar(data_bytes)
+            worker.run(chip_cid, args, cfg)
+
+            worker.mapped_region_wait(region, 1, seq, 1_000_000)  # wait up to 1 s for signal 1 >= seq
+            if worker.mapped_region_datacopy_region2h(region, data_bytes, data_bytes) != _expected(seq, payload):
+                raise AssertionError(f"seq {seq}: device output does not match the expected XOR pattern")
+    finally:
+        if region is not None:
+            worker.close_mapped_region(region)
+        worker.close()
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3"])
+    parser.add_argument("-d", "--device", type=int, default=0)
+    parser.add_argument("--build", action="store_true", help="Rebuild runtime from source.")
+    parser.add_argument("--iters", type=int, default=DEFAULT_ITERS)
+    parser.add_argument("--data-bytes", type=int, default=DEFAULT_DATA_BYTES)
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    run(args.platform, args.device, build=args.build, iters=args.iters, data_bytes=args.data_bytes)
+    print(
+        "[host_device_mapped_region_round_trip] "
+        f"platform={args.platform} device={args.device} iters={args.iters} data_bytes={args.data_bytes} PASSED"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py
new file mode 100644
index 000000000..ec449b40c
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/test_mapped_region_round_trip.py
@@ -0,0 +1,67 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Pytest entrypoint for the HostDeviceMappedRegion round-trip example.
+
+This example demonstrates host CPU to device NPU communication through a
+``HostDeviceMappedRegion``.
The host opens one mapped region, reuses it for 10
+iterations, writes a sequence-dependent input pattern, notifies signal slot 0,
+submits an AIV task with the returned device pointers, waits for signal slot 1,
+then reads and checks the output bytes.
+
+Run directly:
+
+    python examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py -p a2a3sim -d 0
+    python examples/a2a3/tensormap_and_ringbuffer/host_device_mapped_region_round_trip/main.py -p a2a3 -d 0
+
+Use ``--build`` to rebuild the runtime from source. Use ``--iters N`` to adjust
+the number of reused-region iterations; support gating should keep the default
+10 iterations.
+"""
+
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+HERE = Path(__file__).resolve().parent
+REPO_ROOT = HERE.parents[3]  # four directory levels above this example directory
+
+
+def _subprocess_env() -> dict[str, str]:  # copy of os.environ with repo paths prepended to PYTHONPATH
+    env = os.environ.copy()
+    paths = [str(REPO_ROOT), str(REPO_ROOT / "python")]
+    venv_lib = REPO_ROOT / ".venv" / "lib"
+    if venv_lib.exists():
+        paths.extend(str(p) for p in sorted(venv_lib.glob("python*/site-packages")))
+    existing = env.get("PYTHONPATH")
+    if existing:
+        paths.append(existing)  # caller-provided entries stay, but after the repo paths
+    env["PYTHONPATH"] = os.pathsep.join(paths)
+    return env
+
+
+@pytest.mark.platforms(["a2a3sim", "a2a3"])
+def test_host_device_mapped_region_round_trip(request: pytest.FixtureRequest) -> None:
+    platform = request.config.getoption("--platform", default=None) or "a2a3sim"
+    device = request.config.getoption("--device", default=None)
+    device_id = int(str(device).split(",")[0].split("-")[0]) if device is not None else 0  # e.g. "0-3" -> 0
+    result = subprocess.run(
+        [sys.executable, str(HERE / "main.py"), "-p", platform, "-d", str(device_id)],
+        text=True,
+        capture_output=True,
+        timeout=180,
+        check=False,  # the assert below reports the captured output instead
+        env=_subprocess_env(),
+    )
+    assert result.returncode == 0, result.stdout + result.stderr
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 4ba073839..0a39065e6 100644
---
a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -18,10 +18,13 @@ */ #include +#include #include #include #include +#include + #include #include #include @@ -40,6 +43,50 @@ namespace nb = nanobind; +namespace { + +struct MappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint64_t total_bytes; + uint32_t flags; + + MappedRegionInfo( + uint64_t host_data_ptr_, uint64_t device_data_ptr_, uint64_t data_bytes_, uint64_t host_signal_ptr_, + uint64_t device_signal_ptr_, uint32_t signal_count_, uint64_t total_bytes_, uint32_t flags_ + ) : + host_data_ptr(host_data_ptr_), + device_data_ptr(device_data_ptr_), + data_bytes(data_bytes_), + host_signal_ptr(host_signal_ptr_), + device_signal_ptr(device_signal_ptr_), + signal_count(signal_count_), + total_bytes(total_bytes_), + flags(flags_) {} +}; + +MappedRegionInfo make_mapped_region_info(const HostDeviceMappedRegionInfo &info) { + return MappedRegionInfo{ + info.host_data_ptr, info.device_data_ptr, info.data_bytes, info.host_signal_ptr, + info.device_signal_ptr, info.signal_count, info.total_bytes, info.flags, + }; +} + +void raise_python_exception_for_mapped_region_error(const std::exception &e) { + std::string msg = e.what(); + if (msg.find("timed out") != std::string::npos) { + PyErr_SetString(PyExc_TimeoutError, msg.c_str()); + throw nb::python_error(); + } + throw; +} + +} // namespace + // ============================================================================ // Module definition // ============================================================================ @@ -703,6 +750,29 @@ NB_MODULE(_task_interface, m) { return os.str(); }); + nb::class_(m, "MappedRegionInfo") + .def( + nb::init(), + nb::arg("host_data_ptr"), nb::arg("device_data_ptr"), nb::arg("data_bytes"), nb::arg("host_signal_ptr"), + nb::arg("device_signal_ptr"), nb::arg("signal_count"), 
nb::arg("total_bytes"), nb::arg("flags") + ) + .def_ro("host_data_ptr", &MappedRegionInfo::host_data_ptr) + .def_ro("device_data_ptr", &MappedRegionInfo::device_data_ptr) + .def_ro("data_bytes", &MappedRegionInfo::data_bytes) + .def_ro("host_signal_ptr", &MappedRegionInfo::host_signal_ptr) + .def_ro("device_signal_ptr", &MappedRegionInfo::device_signal_ptr) + .def_ro("signal_count", &MappedRegionInfo::signal_count) + .def_ro("total_bytes", &MappedRegionInfo::total_bytes) + .def_ro("flags", &MappedRegionInfo::flags) + .def("__repr__", [](const MappedRegionInfo &info) { + std::ostringstream os; + os << "MappedRegionInfo(device_data_ptr=0x" << std::hex << info.device_data_ptr << ", device_signal_ptr=0x" + << info.device_signal_ptr << std::dec << ", data_bytes=" << info.data_bytes + << ", signal_count=" << info.signal_count << ", total_bytes=" << info.total_bytes + << ", flags=" << info.flags << ")"; + return os.str(); + }); + // --- ChipWorker --- nb::class_(m, "_ChipWorker") .def(nb::init<>()) @@ -800,6 +870,69 @@ NB_MODULE(_task_interface, m) { .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) .def("copy_from", &ChipWorker::copy_from, nb::arg("dst"), nb::arg("src"), nb::arg("size")) + .def( + "open_mapped_region", + [](ChipWorker &self, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + return self.open_mapped_region(data_bytes, signal_count, flags); + }, + nb::arg("data_bytes"), nb::arg("signal_count") = 1, nb::arg("flags") = 0 + ) + .def("close_mapped_region", &ChipWorker::close_mapped_region, nb::arg("handle")) + .def( + "mapped_region_info", + [](ChipWorker &self, uint64_t handle) { + return make_mapped_region_info(self.mapped_region_info(handle)); + }, + nb::arg("handle") + ) + .def( + "mapped_region_datacopy_h2region", + [](ChipWorker &self, uint64_t handle, uint64_t offset, nb::object obj) { + if (PyUnicode_Check(obj.ptr())) { + throw 
std::invalid_argument("mapped_region_datacopy_h2region requires a bytes-like object"); + } + Py_buffer view{}; + if (PyObject_GetBuffer(obj.ptr(), &view, PyBUF_CONTIG_RO) != 0) { + throw nb::python_error(); + } + try { + self.mapped_region_datacopy_h2region(handle, offset, view.buf, static_cast(view.len)); + } catch (const std::exception &e) { + PyBuffer_Release(&view); + raise_python_exception_for_mapped_region_error(e); + } + PyBuffer_Release(&view); + }, + nb::arg("handle"), nb::arg("offset"), nb::arg("buffer") + ) + .def( + "mapped_region_datacopy_region2h", + [](ChipWorker &self, uint64_t handle, uint64_t offset, size_t nbytes) { + std::string out(nbytes, '\0'); + try { + self.mapped_region_datacopy_region2h(handle, offset, out.data(), nbytes); + } catch (const std::exception &e) { + raise_python_exception_for_mapped_region_error(e); + } + return nb::bytes(out.data(), out.size()); + }, + nb::arg("handle"), nb::arg("offset"), nb::arg("nbytes") + ) + .def( + "mapped_region_notify", &ChipWorker::mapped_region_notify, nb::arg("handle"), nb::arg("signal_id"), + nb::arg("value") + ) + .def( + "mapped_region_wait", + [](ChipWorker &self, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us) { + try { + self.mapped_region_wait(handle, signal_id, target, timeout_us); + } catch (const std::exception &e) { + raise_python_exception_for_mapped_region_error(e); + } + }, + nb::arg("handle"), nb::arg("signal_id"), nb::arg("target"), nb::arg("timeout_us") + ) .def( "comm_init", &ChipWorker::comm_init, nb::arg("rank"), nb::arg("nranks"), nb::arg("rootinfo_path"), "Initialize a communicator for this rank. 
ChipWorker owns ACL + stream " diff --git a/python/bindings/worker_bind.h b/python/bindings/worker_bind.h index 19c6598dd..bafb9ae14 100644 --- a/python/bindings/worker_bind.h +++ b/python/bindings/worker_bind.h @@ -296,12 +296,49 @@ inline void bind_worker(nb::module_ &m) { "control_comm_init", &Worker::control_comm_init, nb::arg("worker_id"), nb::arg("request_shm_name"), nb::call_guard(), "Drive one NEXT_LEVEL chip child through CTRL_COMM_INIT (lazy base comm init)." + ) + .def( + "control_open_mapped_region", &Worker::control_open_mapped_region, nb::arg("worker_id"), + nb::arg("data_bytes"), nb::arg("signal_count"), nb::arg("flags"), nb::call_guard(), + "Open a HostDeviceMappedRegion on a NEXT_LEVEL chip child." + ) + .def( + "control_close_mapped_region", &Worker::control_close_mapped_region, nb::arg("worker_id"), + nb::arg("handle"), nb::call_guard(), + "Close a HostDeviceMappedRegion on a NEXT_LEVEL chip child." + ) + .def( + "control_mapped_region_payload", &Worker::control_mapped_region_payload, nb::arg("worker_id"), + nb::arg("sub_cmd"), nb::arg("shm_name"), nb::call_guard(), + "Dispatch a HostDeviceMappedRegion side-band shm payload command." + ) + .def( + "control_mapped_region_notify", &Worker::control_mapped_region_notify, nb::arg("worker_id"), + nb::arg("handle"), nb::arg("signal_id"), nb::arg("value"), nb::call_guard(), + "Notify a HostDeviceMappedRegion signal slot on a NEXT_LEVEL chip child." + ) + .def( + "control_mapped_region_wait", &Worker::control_mapped_region_wait, nb::arg("worker_id"), nb::arg("handle"), + nb::arg("signal_id"), nb::arg("target"), nb::arg("timeout_us"), nb::call_guard(), + "Wait on a HostDeviceMappedRegion signal slot on a NEXT_LEVEL chip child." 
def open_mapped_region(self, data_bytes: int, signal_count: int = 1, flags: int = 0) -> int:
    """Open a mapped region on this chip worker.

    Every argument is coerced with ``int()`` before it is forwarded to the
    native implementation.

    Returns:
        The region handle as a Python ``int``; callers treat it as an opaque token.
    """
    impl = self._impl
    raw_handle = impl.open_mapped_region(int(data_bytes), int(signal_count), int(flags))
    return int(raw_handle)


def close_mapped_region(self, handle: int) -> None:
    """Close a region previously opened on this chip worker."""
    impl = self._impl
    impl.close_mapped_region(int(handle))


def mapped_region_info(self, handle: int) -> MappedRegionInfo:
    """Return the ``MappedRegionInfo`` the native implementation reports for *handle*.

    NOTE(review): the public view is expected to carry zero host pointers; this
    wrapper forwards the native result unchanged and does not mask anything itself.
    """
    impl = self._impl
    return impl.mapped_region_info(int(handle))


def mapped_region_datacopy_h2region(self, handle: int, offset: int, data) -> None:
    """Copy the bytes-like *data* into the region's data area at byte *offset*.

    *data* is passed through as-is; buffer validation happens in the native binding.
    """
    impl = self._impl
    impl.mapped_region_datacopy_h2region(int(handle), int(offset), data)


def mapped_region_datacopy_region2h(self, handle: int, offset: int, nbytes: int) -> bytes:
    """Read *nbytes* bytes from the region's data area starting at byte *offset*."""
    impl = self._impl
    return impl.mapped_region_datacopy_region2h(int(handle), int(offset), int(nbytes))


def mapped_region_notify(self, handle: int, signal_id: int, value: int) -> None:
    """Publish *value* to signal slot *signal_id* of the region."""
    impl = self._impl
    impl.mapped_region_notify(int(handle), int(signal_id), int(value))


def mapped_region_wait(self, handle: int, signal_id: int, target: int, timeout_us: int) -> None:
    """Wait until signal slot *signal_id* reaches ``target``.

    *timeout_us* is forwarded to the native wait; the binding turns a native
    error whose message contains "timed out" into ``TimeoutError``.
    """
    impl = self._impl
    impl.mapped_region_wait(int(handle), int(signal_id), int(target), int(timeout_us))
diff --git a/python/simpler/worker.py b/python/simpler/worker.py index e4956e708..4ff528fb3 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -56,17 +56,31 @@ def my_l4_orch(orch, args, config): """ import ctypes +import errno import os import signal import struct import sys import threading import time +from dataclasses import dataclass from multiprocessing.shared_memory import SharedMemory from typing import Any, Optional import cloudpickle from _task_interface import ( # pyright: ignore[reportMissingImports] + CTRL_CLOSE_MAPPED_REGION as _CPP_CTRL_CLOSE_MAPPED_REGION, + CTRL_MAPPED_REGION_DATACOPY_H2REGION as _CPP_CTRL_MAPPED_REGION_DATACOPY_H2REGION, + CTRL_MAPPED_REGION_DATACOPY_REGION2H as _CPP_CTRL_MAPPED_REGION_DATACOPY_REGION2H, + CTRL_MAPPED_REGION_INFO as _CPP_CTRL_MAPPED_REGION_INFO, + CTRL_MAPPED_REGION_NOTIFY as _CPP_CTRL_MAPPED_REGION_NOTIFY, + CTRL_MAPPED_REGION_WAIT as _CPP_CTRL_MAPPED_REGION_WAIT, + CTRL_OFF_ARG0 as _CPP_CTRL_OFF_ARG0, + CTRL_OFF_ARG1 as _CPP_CTRL_OFF_ARG1, + CTRL_OFF_ARG2 as _CPP_CTRL_OFF_ARG2, + CTRL_OFF_ARG3 as _CPP_CTRL_OFF_ARG3, + CTRL_OFF_RESULT as _CPP_CTRL_OFF_RESULT, + CTRL_OPEN_MAPPED_REGION as _CPP_CTRL_OPEN_MAPPED_REGION, MAX_REGISTERED_CALLABLE_IDS, RunTiming, WorkerType, @@ -87,6 +101,7 @@ def my_l4_orch(orch, args, config): ChipWorker, CommBufferSpec, CommDomainHandle, + MappedRegionInfo, TaskArgs, _Worker, ) @@ -169,6 +184,13 @@ def my_l4_orch(orch, args, config): _CTRL_COMM_INIT = 9 _CTRL_PY_REGISTER = 10 _CTRL_PY_UNREGISTER = 11 +_CTRL_OPEN_MAPPED_REGION = int(_CPP_CTRL_OPEN_MAPPED_REGION) +_CTRL_CLOSE_MAPPED_REGION = int(_CPP_CTRL_CLOSE_MAPPED_REGION) +_CTRL_MAPPED_REGION_INFO = int(_CPP_CTRL_MAPPED_REGION_INFO) +_CTRL_MAPPED_REGION_DATACOPY_H2REGION = int(_CPP_CTRL_MAPPED_REGION_DATACOPY_H2REGION) +_CTRL_MAPPED_REGION_DATACOPY_REGION2H = int(_CPP_CTRL_MAPPED_REGION_DATACOPY_REGION2H) +_CTRL_MAPPED_REGION_NOTIFY = int(_CPP_CTRL_MAPPED_REGION_NOTIFY) +_CTRL_MAPPED_REGION_WAIT = 
def _mapped_region_exception_status(exc: BaseException) -> int:
    """Translate an exception from a mapped-region call into a negative errno.

    Mapping, checked in this order:

    * ``ValueError`` / ``TypeError`` -> ``-EINVAL``
    * ``TimeoutError`` -> ``-EAGAIN``
    * message containing ``"-95"`` or ``"ENOTSUP"`` -> ``-ENOTSUP``
    * anything else -> ``-EIO``
    """
    if isinstance(exc, (ValueError, TypeError)):
        return -errno.EINVAL
    if isinstance(exc, TimeoutError):
        return -errno.EAGAIN
    # Other failures are only distinguishable by their message text.
    text = str(exc)
    unsupported = any(marker in text for marker in ("-95", "ENOTSUP"))
    return -errno.ENOTSUP if unsupported else -errno.EIO
required_size > shm.size: + raise RuntimeError(f"mapped-region payload size mismatch: need {required_size}, shm={shm.size}") + + status = 0 + try: + if op == _HDMR_OP_INFO: + info = cw.mapped_region_info(int(region)) + _HDMR_INFO_PAYLOAD.pack_into( + shm_buf, + _HDMR_HEADER.size, + 0, + int(info.device_data_ptr), + int(info.data_bytes), + 0, + int(info.device_signal_ptr), + int(info.signal_count), + int(info.total_bytes), + int(info.flags), + ) + elif op == _HDMR_OP_H2REGION: + payload = bytes(shm_buf[_HDMR_HEADER.size : _HDMR_HEADER.size + int(nbytes)]) + cw.mapped_region_datacopy_h2region(int(region), int(offset), payload) + elif op == _HDMR_OP_REGION2H: + payload = cw.mapped_region_datacopy_region2h(int(region), int(offset), int(nbytes)) + shm_buf[_HDMR_HEADER.size : _HDMR_HEADER.size + int(nbytes)] = payload + else: + status = -errno.EINVAL + except Exception as e: # noqa: BLE001 + status = _mapped_region_exception_status(e) + struct.pack_into(" int: """Return the cached base-communicator handle the chip allocated during bootstrap. 
@dataclass(frozen=True)
class MappedRegion:
    """Parent-side record of one opened mapped region.

    ``Worker.open_mapped_region`` builds an instance from the handle it gets
    back plus the arguments of the open call; the other ``Worker``
    mapped-region methods take the instance as their ``region`` argument.
    """

    # Handle value returned by the open call; passed back verbatim on every operation.
    handle: int
    # Worker the region was opened on; operations naming a different worker are rejected.
    worker_id: int
    # Data-area size requested at open time, in bytes.
    data_bytes: int
    # Number of signal slots requested at open time.
    signal_count: int
    # Flags passed at open time.
    flags: int
    # NOTE: the dataclass is frozen, so ``Worker.close_mapped_region`` flips this
    # via ``object.__setattr__`` after a successful close.
    closed: bool = False
def _resolve_mapped_region_worker_id(self, region: MappedRegion, worker_id: Optional[int]) -> int:
    """Return the worker id to use for *region*.

    ``None`` selects the region's own worker. An explicit id is coerced with
    ``int()`` and must equal ``region.worker_id``.

    Raises:
        ValueError: if the explicit id names a different worker.
    """
    owner = region.worker_id
    requested = owner if worker_id is None else int(worker_id)
    if requested == owner:
        return requested
    raise ValueError(f"mapped region belongs to worker_id={owner}, got worker_id={requested}")


def _ensure_open_mapped_region(self, region: MappedRegion, worker_id: Optional[int]) -> int:
    """Resolve the worker id like ``_resolve_mapped_region_worker_id`` and reject closed regions.

    Raises:
        ValueError: on a worker mismatch or when ``region.closed`` is set.
    """
    owner = self._resolve_mapped_region_worker_id(region, worker_id)
    if not region.closed:
        return owner
    raise ValueError("mapped region is closed")


def _mapped_region_chip_worker(self, worker_id: int) -> ChipWorker:
    """Return the in-process ``ChipWorker`` that serves level-2 mapped regions.

    Raises:
        NotImplementedError: if this worker is not level 2.
        ValueError: if *worker_id* is not 0.
        RuntimeError: if no chip worker has been created yet.
    """
    if self.level != 2:
        raise NotImplementedError("mapped-region L3 proxy support is not implemented yet")
    if worker_id != 0:
        raise ValueError("level-2 mapped regions only support worker_id=0")
    chip = self._chip_worker
    if chip is None:
        raise RuntimeError("Worker.init() must be called before mapped-region operations")
    return chip


def _ensure_mapped_region_l3_control_ready(self, worker_id: int) -> _Worker:
    """Return the native worker used to send mapped-region control commands.

    Validates that this is an initialised level-3 worker and that *worker_id*
    passes ``_check_chip_worker_id``, then starts the hierarchy if
    ``_hierarchical_started`` is not yet set.

    Raises:
        RuntimeError: if not initialised, not level 3, or no native worker is
            available afterwards.
    """
    if not self._initialized:
        raise RuntimeError("Worker.init() must be called before mapped-region operations")
    if self.level != 3:
        raise RuntimeError("mapped-region L3 proxy support requires a level-3 Worker with chip children")
    self._check_chip_worker_id(worker_id)
    already_started = getattr(self, "_hierarchical_started", False)
    if not already_started:
        self._start_hierarchical()
    # Read only after the start attempt above, which may be what creates it.
    control = self._worker
    if control is None:
        raise RuntimeError("mapped-region L3 proxy is not available after Worker.close()")
    return control


def _raise_mapped_region_control_error(self, exc: RuntimeError) -> None:
    """Re-raise a control-path ``RuntimeError`` as a more specific exception.

    The decision is made from the message text only. This function always
    raises: ``TimeoutError`` or ``ValueError`` (chained to *exc*) when a known
    marker is found, otherwise *exc* itself.
    """
    text = str(exc)
    if any(marker in text for marker in ("TimeoutError", "timed out")):
        raise TimeoutError(text) from exc
    if any(marker in text for marker in ("ValueError", "invalid_argument", "code -22")):
        raise ValueError(text) from exc
    raise exc


def _raise_mapped_region_status(self, status: int) -> None:
    """Raise the exception matching a non-zero mapped-region status code.

    ``0`` returns normally. ``-EAGAIN`` / ``-EWOULDBLOCK`` raise
    ``TimeoutError``, ``-EINVAL`` raises ``ValueError`` and every other value
    raises ``RuntimeError``.
    """
    if status == 0:
        return
    if status in (-errno.EAGAIN, -errno.EWOULDBLOCK):
        raise TimeoutError(f"mapped-region operation timed out with status {status}")
    if status == -errno.EINVAL:
        raise ValueError(f"mapped-region operation failed with status {status}")
    raise RuntimeError(f"mapped-region operation failed with status {status}")


def _mapped_region_shm_name(self, worker_id: int) -> str:
    """Return a fresh shared-memory name for one mapped-region payload exchange.

    The name combines this process id, *worker_id* and a per-instance counter
    that is advanced on every call (including calls that end up raising).

    Raises:
        RuntimeError: if the UTF-8 encoded name plus one extra byte exceeds
            ``_CTRL_SHM_NAME_BYTES``.
    """
    sequence = getattr(self, "_mapped_region_shm_counter", 0)
    self._mapped_region_shm_counter = sequence + 1
    name = f"simpler-hdmr-{os.getpid()}-{int(worker_id)}-{sequence}"
    # One byte is reserved on top of the encoded name (presumably a NUL terminator).
    needed_bytes = len(name.encode("utf-8")) + 1
    if needed_bytes > _CTRL_SHM_NAME_BYTES:
        raise RuntimeError(f"mapped-region shm name too long: {name}")
    return name
def close_mapped_region(self, region: MappedRegion, worker_id: Optional[int] = None) -> None:
    """Close *region* on its owning worker and mark the wrapper closed.

    Closing an already closed region is a no-op (the worker id is still
    validated first). ``region.closed`` is only set after the close call
    succeeded.

    Raises:
        ValueError: if *worker_id* names a different worker than the region's.
    """
    owner = self._resolve_mapped_region_worker_id(region, worker_id)
    if region.closed:
        return
    if self.level != 2:
        control = self._ensure_mapped_region_l3_control_ready(owner)
        try:
            control.control_close_mapped_region(owner, int(region.handle))
        except RuntimeError as err:
            self._raise_mapped_region_control_error(err)
    else:
        self._mapped_region_chip_worker(owner).close_mapped_region(region.handle)
    # MappedRegion is a frozen dataclass, hence the explicit object.__setattr__.
    object.__setattr__(region, "closed", True)


def mapped_region_info(self, region: MappedRegion, worker_id: Optional[int] = None) -> MappedRegionInfo:
    """Return the ``MappedRegionInfo`` of an open region.

    Level 2 asks the chip worker directly. Otherwise the info is fetched with
    a payload round trip and rebuilt from the unpacked reply fields.
    """
    owner = self._ensure_open_mapped_region(region, worker_id)
    if self.level == 2:
        chip = self._mapped_region_chip_worker(owner)
        return chip.mapped_region_info(region.handle)
    status, reply = self._mapped_region_payload_roundtrip(
        owner,
        _CTRL_MAPPED_REGION_INFO,
        _HDMR_OP_INFO,
        int(region.handle),
        0,
        b"",
        _HDMR_INFO_PAYLOAD.size,
    )
    self._raise_mapped_region_status(status)
    return MappedRegionInfo(*_HDMR_INFO_PAYLOAD.unpack_from(reply, 0))


def mapped_region_datacopy_h2region(
    self,
    region: MappedRegion,
    offset: int,
    data,
    worker_id: Optional[int] = None,
) -> None:
    """Copy bytes-like *data* into the region's data area at byte *offset*.

    Raises:
        ValueError: if *data* is a ``str``, the region is closed or belongs to
            another worker, or (non-level-2 path) *data* does not expose a
            contiguous buffer.
    """
    if isinstance(data, str):
        raise ValueError("mapped_region_datacopy_h2region requires a bytes-like object")
    owner = self._ensure_open_mapped_region(region, worker_id)
    if self.level == 2:
        chip = self._mapped_region_chip_worker(owner)
        chip.mapped_region_datacopy_h2region(region.handle, int(offset), data)
        return
    # Proxy path: the payload is copied into shared memory, so it must be a contiguous buffer.
    try:
        view = memoryview(data)
    except TypeError as err:
        raise ValueError("mapped_region_datacopy_h2region requires a bytes-like object") from err
    if not view.contiguous:
        raise ValueError("mapped_region_datacopy_h2region requires a contiguous bytes-like object")
    status, _unused = self._mapped_region_payload_roundtrip(
        owner,
        _CTRL_MAPPED_REGION_DATACOPY_H2REGION,
        _HDMR_OP_H2REGION,
        int(region.handle),
        int(offset),
        bytes(view),
        0,
    )
    self._raise_mapped_region_status(status)


def mapped_region_datacopy_region2h(
    self,
    region: MappedRegion,
    offset: int,
    nbytes: int,
    worker_id: Optional[int] = None,
) -> bytes:
    """Read *nbytes* bytes from the region's data area starting at byte *offset*."""
    owner = self._ensure_open_mapped_region(region, worker_id)
    if self.level == 2:
        chip = self._mapped_region_chip_worker(owner)
        return chip.mapped_region_datacopy_region2h(region.handle, int(offset), int(nbytes))
    status, reply = self._mapped_region_payload_roundtrip(
        owner,
        _CTRL_MAPPED_REGION_DATACOPY_REGION2H,
        _HDMR_OP_REGION2H,
        int(region.handle),
        int(offset),
        b"",
        int(nbytes),
    )
    self._raise_mapped_region_status(status)
    return reply


def mapped_region_notify(
    self,
    region: MappedRegion,
    signal_id: int,
    value: int,
    worker_id: Optional[int] = None,
) -> None:
    """Publish *value* to signal slot *signal_id* of an open region."""
    owner = self._ensure_open_mapped_region(region, worker_id)
    if self.level == 2:
        chip = self._mapped_region_chip_worker(owner)
        chip.mapped_region_notify(region.handle, int(signal_id), int(value))
        return
    control = self._ensure_mapped_region_l3_control_ready(owner)
    try:
        control.control_mapped_region_notify(owner, int(region.handle), int(signal_id), int(value))
    except RuntimeError as err:
        self._raise_mapped_region_control_error(err)
selected = self._ensure_open_mapped_region(region, worker_id) + if self.level == 2: + self._mapped_region_chip_worker(selected).mapped_region_wait( + region.handle, int(signal_id), int(target), int(timeout_us) + ) + return + dw = self._ensure_mapped_region_l3_control_ready(selected) + try: + dw.control_mapped_region_wait(selected, int(region.handle), int(signal_id), int(target), int(timeout_us)) + except RuntimeError as e: + self._raise_mapped_region_control_error(e) + def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) n_sub = self._config.get("num_sub_workers", 0) diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index f0f01d438..c60ecf0b7 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -51,8 +51,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/device_runner.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/host_device_mapped_region_onboard.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/comm_hccl.cpp" diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 4f04a94ce..967e0f139 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -495,6 +495,107 @@ void DeviceRunner::free_tensor(void *dev_ptr) { } } +int DeviceRunner::host_register_device_memory(void *dev_ptr, size_t bytes, void **host_ptr) { + if (dev_ptr == nullptr || host_ptr == nullptr || bytes == 0) { + 
return -1; + } + *host_ptr = nullptr; + if (device_id_ < 0) { + LOG_ERROR("host_register_device_memory requires an attached device"); + return -1; + } + if (load_hal_if_needed() != 0) { + LOG_ERROR("Failed to load ascend_hal for mapped region: %s", dlerror()); + return -1; + } + HalHostRegisterFn fn = get_halHostRegister(); + if (fn == nullptr) { + LOG_ERROR("halHostRegister symbol not found: %s", dlerror()); + return -1; + } + int rc = fn(dev_ptr, bytes, DEV_SVM_MAP_HOST, device_id_, host_ptr); + if (rc != 0) { + LOG_ERROR( + "halHostRegister mapped region failed: dev_ptr=%p size=%zu device=%d rc=%d", dev_ptr, bytes, device_id_, rc + ); + } + return rc; +} + +int DeviceRunner::host_unregister_device_memory(void *host_ptr) { + if (host_ptr == nullptr) { + return 0; + } + if (device_id_ < 0) { + LOG_ERROR("host_unregister_device_memory requires an attached device"); + return -1; + } + HalHostUnregisterFn fn = get_halHostUnregister(); + if (fn == nullptr) { + LOG_ERROR("halHostUnregister symbol not found: %s", dlerror()); + return -1; + } + int rc = fn(host_ptr, device_id_); + if (rc != 0) { + LOG_ERROR("halHostUnregister mapped region failed: host_ptr=%p device=%d rc=%d", host_ptr, device_id_, rc); + } + return rc; +} + +namespace { + +void clean_host_cache_range(void *host_ptr, size_t bytes) { + if (host_ptr == nullptr || bytes == 0) { + return; + } +#if defined(__aarch64__) + constexpr uintptr_t kCacheLineBytes = 64; + uintptr_t start = reinterpret_cast(host_ptr) & ~(kCacheLineBytes - 1U); + uintptr_t end = (reinterpret_cast(host_ptr) + bytes + kCacheLineBytes - 1U) & ~(kCacheLineBytes - 1U); + for (uintptr_t p = start; p < end; p += kCacheLineBytes) { + __asm__ __volatile__("dc cvac, %0" ::"r"(p) : "memory"); + } + __asm__ __volatile__("dsb sy" ::: "memory"); + __asm__ __volatile__("isb" ::: "memory"); +#elif defined(__x86_64__) + __asm__ __volatile__("" ::: "memory"); +#else + __asm__ __volatile__("" ::: "memory"); +#endif +} + +void 
clean_invalidate_host_cache_range(void *host_ptr, size_t bytes) { + if (host_ptr == nullptr || bytes == 0) { + return; + } +#if defined(__aarch64__) + constexpr uintptr_t kCacheLineBytes = 64; + uintptr_t start = reinterpret_cast(host_ptr) & ~(kCacheLineBytes - 1U); + uintptr_t end = (reinterpret_cast(host_ptr) + bytes + kCacheLineBytes - 1U) & ~(kCacheLineBytes - 1U); + for (uintptr_t p = start; p < end; p += kCacheLineBytes) { + __asm__ __volatile__("dc civac, %0" ::"r"(p) : "memory"); + } + __asm__ __volatile__("dsb sy" ::: "memory"); + __asm__ __volatile__("isb" ::: "memory"); +#elif defined(__x86_64__) + __asm__ __volatile__("" ::: "memory"); +#else + __asm__ __volatile__("" ::: "memory"); +#endif +} + +} // namespace + +int DeviceRunner::flush_host_cache_range(void *host_ptr, size_t bytes) { + clean_host_cache_range(host_ptr, bytes); + return 0; +} + +int DeviceRunner::invalidate_host_cache_range(void *host_ptr, size_t bytes) { + clean_invalidate_host_cache_range(host_ptr, bytes); + return 0; +} + int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) { return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index bd9d088b0..8870c8d58 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -233,6 +233,28 @@ class DeviceRunner { */ void free_tensor(void *dev_ptr); + /** + * Map device memory into a host-visible aperture via Ascend HAL. + * + * @param dev_ptr Device allocation returned by allocate_tensor(). + * @param bytes Mapping size in bytes. + * @param host_ptr Output host mapping pointer. + * @return 0 on success, non-zero HAL/runtime error on failure. + */ + int host_register_device_memory(void *dev_ptr, size_t bytes, void **host_ptr); + + /** + * Unmap a host-visible aperture created by host_register_device_memory(). 
+ * + * @param host_ptr Host mapping pointer returned by HAL. + * @return 0 on success, non-zero HAL/runtime error on failure. + */ + int host_unregister_device_memory(void *host_ptr); + + int flush_host_cache_range(void *host_ptr, size_t bytes); + + int invalidate_host_cache_range(void *host_ptr, size_t bytes); + /** * Copy data from host to device * diff --git a/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp new file mode 100644 index 000000000..1c10e6970 --- /dev/null +++ b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host_device_mapped_region_onboard.h" + +#include + +#include + +#include "device_runner.h" + +namespace { + +struct OnboardMappedRegionResource { + DeviceRunner *runner = nullptr; + void *dev_ptr = nullptr; + void *host_ptr = nullptr; +}; + +} // namespace + +int a2a3_onboard_host_device_mapped_region_allocate( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + if (ctx == NULL || platform == NULL || host_base == NULL || device_base == NULL) { + return -EINVAL; + } + *host_base = nullptr; + *device_base = nullptr; + auto *runner = static_cast(ctx); + if (runner->device_id() < 0) { + return -EIO; + } + + auto *resource = new (std::nothrow) OnboardMappedRegionResource; + if (resource == nullptr) { + return -ENOMEM; + } + resource->runner = runner; + resource->dev_ptr = runner->allocate_tensor(static_cast(total_bytes)); + if (resource->dev_ptr == nullptr) { + delete resource; + return -ENOMEM; + } + + int rc = + runner->host_register_device_memory(resource->dev_ptr, static_cast(total_bytes), &resource->host_ptr); + if (rc != 0 || resource->host_ptr == nullptr) { + runner->free_tensor(resource->dev_ptr); + delete resource; + return -EIO; + } + + platform->resource = resource; + platform->device_id = static_cast(runner->device_id()); + platform->cache_ops_cookie = runner; + platform->flush_host_range = [](HostDeviceMappedRegionPlatform *p, void *host_ptr, uint64_t bytes) { + auto *mapped_runner = static_cast(p->cache_ops_cookie); + if (mapped_runner == nullptr) { + return -1; + } + return mapped_runner->flush_host_cache_range(host_ptr, static_cast(bytes)); + }; + platform->invalidate_host_range = [](HostDeviceMappedRegionPlatform *p, void *host_ptr, uint64_t bytes) { + auto *mapped_runner = static_cast(p->cache_ops_cookie); + if (mapped_runner == nullptr) { + 
return -1; + } + return mapped_runner->invalidate_host_cache_range(host_ptr, static_cast(bytes)); + }; + platform->release = [](HostDeviceMappedRegionPlatform *p) { + auto *r = static_cast(p->resource); + if (r == nullptr) { + return; + } + if (r->runner != nullptr) { + if (r->host_ptr != nullptr) { + (void)r->runner->host_unregister_device_memory(r->host_ptr); + } + if (r->dev_ptr != nullptr) { + r->runner->free_tensor(r->dev_ptr); + } + } + delete r; + p->resource = nullptr; + }; + *host_base = resource->host_ptr; + *device_base = resource->dev_ptr; + return 0; +} diff --git a/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h new file mode 100644 index 000000000..6bfd77d6b --- /dev/null +++ b/src/a2a3/platform/onboard/host/host_device_mapped_region_onboard.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_ +#define SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_ + +#include "host_device_comm/host_device_mapped_region.h" + +int a2a3_onboard_host_device_mapped_region_allocate( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +); + +#endif // SRC_A2A3_PLATFORM_ONBOARD_HOST_HOST_DEVICE_MAPPED_REGION_ONBOARD_H_ diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index f36aa6f0d..457449506 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -18,9 +18,12 @@ #include "pto_runtime_c_api.h" #include "callable.h" +#include "host_device_comm/host_device_mapped_region.h" +#include "host_device_mapped_region_onboard.h" #include "prepare_callable_common.h" #include "task_args.h" +#include #include #include @@ -144,7 +147,10 @@ DeviceContextHandle create_device_context(void) { } } -void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } +void destroy_device_context(DeviceContextHandle ctx) { + host_device_mapped_region_close_all_common(ctx); + delete static_cast(ctx); +} size_t get_runtime_size(void) { return sizeof(Runtime); } @@ -218,12 +224,59 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { + host_device_mapped_region_close_all_common(ctx); return static_cast(ctx)->finalize(); } catch (...) 
{ return -1; } } +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +) { + return host_device_mapped_region_open_common(ctx, cfg, out_region, a2a3_onboard_host_device_mapped_region_allocate); +} + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) { + return host_device_mapped_region_close_common(ctx, region); +} + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +) { + int rc = host_device_mapped_region_info_common(ctx, region, info); + if (rc == 0) { + info->host_data_ptr = 0; + info->host_signal_ptr = 0; + } + return rc; +} + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +) { + return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes); +} + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +) { + return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes); +} + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +) { + return host_device_mapped_region_notify_common(ctx, region, signal_id, value); +} + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + return host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us); +} + int simpler_init( DeviceContextHandle ctx, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size diff --git 
a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt index 49d0fe62a..c0e407eef 100644 --- a/src/a2a3/platform/sim/host/CMakeLists.txt +++ b/src/a2a3/platform/sim/host/CMakeLists.txt @@ -43,6 +43,8 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region_sim.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 7c1e3cb7e..b5cc84bdd 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -18,9 +18,12 @@ #include "pto_runtime_c_api.h" #include "callable.h" +#include "host_device_comm/host_device_mapped_region.h" +#include "host_device_comm/host_device_mapped_region_sim.h" #include "prepare_callable_common.h" #include "task_args.h" +#include #include #include @@ -139,7 +142,10 @@ DeviceContextHandle create_device_context(void) { } } -void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } +void destroy_device_context(DeviceContextHandle ctx) { + host_device_mapped_region_close_all_common(ctx); + delete static_cast(ctx); +} size_t get_runtime_size(void) { return sizeof(Runtime); } @@ -180,6 +186,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { + host_device_mapped_region_close_all_common(ctx); int rc = static_cast(ctx)->finalize(); int dev = 
pto_cpu_sim_get_bound_device(); if (dev >= 0) { @@ -191,6 +198,52 @@ int finalize_device(DeviceContextHandle ctx) { } } +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +) { + return host_device_mapped_region_open_common(ctx, cfg, out_region, host_device_mapped_region_allocate_sim); +} + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) { + return host_device_mapped_region_close_common(ctx, region); +} + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +) { + int rc = host_device_mapped_region_info_common(ctx, region, info); + if (rc == 0) { + info->host_data_ptr = 0; + info->host_signal_ptr = 0; + } + return rc; +} + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +) { + return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes); +} + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +) { + return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes); +} + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +) { + return host_device_mapped_region_notify_common(ctx, region, signal_id, value); +} + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + return host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us); +} + /* =========================================================================== * ACL lifecycle 
stubs. Sim has no ACL / aclrtStream concept, so these * no-op to satisfy the uniform host_runtime.so ABI (ChipWorker dlsym's the diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index e5b57bf7a..44a0b2184 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -40,6 +40,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 21f919fd0..b180bdc89 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -18,9 +18,11 @@ #include "pto_runtime_c_api.h" #include "callable.h" +#include "host_device_comm/host_device_mapped_region.h" #include "prepare_callable_common.h" #include "task_args.h" +#include #include #include @@ -185,12 +187,65 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { + host_device_mapped_region_close_all_common(ctx); return static_cast(ctx)->finalize(); } catch (...) 
{ return -1; } } +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +) { + (void)ctx; + (void)cfg; + if (out_region != NULL) { + *out_region = NULL; + } + return -ENOTSUP; +} + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) { + int rc = host_device_mapped_region_close_common(ctx, region); + return rc == -EINVAL ? -ENOTSUP : rc; +} + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +) { + int rc = host_device_mapped_region_info_common(ctx, region, info); + return rc == -EINVAL ? -ENOTSUP : rc; +} + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +) { + int rc = host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes); + return rc == -EINVAL ? -ENOTSUP : rc; +} + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +) { + int rc = host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes); + return rc == -EINVAL ? -ENOTSUP : rc; +} + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +) { + int rc = host_device_mapped_region_notify_common(ctx, region, signal_id, value); + return rc == -EINVAL ? -ENOTSUP : rc; +} + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + int rc = host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us); + return rc == -EINVAL ? 
-ENOTSUP : rc; +} + /* =========================================================================== * ACL + comm_* placeholders (distributed runtime not yet implemented on a5) * diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt index c42ed3fa7..31e7c311c 100644 --- a/src/a5/platform/sim/host/CMakeLists.txt +++ b/src/a5/platform/sim/host/CMakeLists.txt @@ -44,6 +44,8 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/profiling_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host_device_comm/host_device_mapped_region_sim.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index debf09f75..f1746811a 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -18,9 +18,12 @@ #include "pto_runtime_c_api.h" #include "callable.h" +#include "host_device_comm/host_device_mapped_region.h" +#include "host_device_comm/host_device_mapped_region_sim.h" #include "prepare_callable_common.h" #include "task_args.h" +#include #include #include @@ -139,7 +142,10 @@ DeviceContextHandle create_device_context(void) { } } -void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } +void destroy_device_context(DeviceContextHandle ctx) { + host_device_mapped_region_close_all_common(ctx); + delete static_cast(ctx); +} size_t get_runtime_size(void) { return sizeof(Runtime); } @@ -180,6 +186,7 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de int 
finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { + host_device_mapped_region_close_all_common(ctx); int rc = static_cast(ctx)->finalize(); int dev = pto_cpu_sim_get_bound_device(); if (dev >= 0) { @@ -191,6 +198,52 @@ int finalize_device(DeviceContextHandle ctx) { } } +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +) { + return host_device_mapped_region_open_common(ctx, cfg, out_region, host_device_mapped_region_allocate_sim); +} + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region) { + return host_device_mapped_region_close_common(ctx, region); +} + +int host_device_mapped_region_info_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +) { + int rc = host_device_mapped_region_info_common(ctx, region, info); + if (rc == 0) { + info->host_data_ptr = 0; + info->host_signal_ptr = 0; + } + return rc; +} + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +) { + return host_device_mapped_region_datacopy_h2region_common(ctx, region, offset, src, nbytes); +} + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +) { + return host_device_mapped_region_datacopy_region2h_common(ctx, region, offset, dst, nbytes); +} + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +) { + return host_device_mapped_region_notify_common(ctx, region, signal_id, value); +} + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + return 
host_device_mapped_region_wait_common(ctx, region, signal_id, target, timeout_us); +} + /* =========================================================================== * ACL lifecycle stubs. Sim has no ACL / aclrtStream concept, so these no-op * to satisfy the uniform host_runtime.so ABI that ChipWorker dlsym's. The diff --git a/src/common/hierarchical/worker.h b/src/common/hierarchical/worker.h index 3ff7ec1be..0b8c925c8 100644 --- a/src/common/hierarchical/worker.h +++ b/src/common/hierarchical/worker.h @@ -105,6 +105,23 @@ class Worker { void control_comm_init(int worker_id, const std::string &request_shm_name) { manager_.control_comm_init(worker_id, request_shm_name.c_str()); } + uint64_t control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + return manager_.control_open_mapped_region(worker_id, data_bytes, signal_count, flags); + } + void control_close_mapped_region(int worker_id, uint64_t handle) { + manager_.control_close_mapped_region(worker_id, handle); + } + void control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const std::string &shm_name) { + manager_.control_mapped_region_payload(worker_id, sub_cmd, shm_name.c_str()); + } + void control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value) { + manager_.control_mapped_region_notify(worker_id, handle, signal_id, value); + } + void control_mapped_region_wait( + int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us + ) { + manager_.control_mapped_region_wait(worker_id, handle, signal_id, target, timeout_us); + } // Broadcast CTRL_REGISTER / CTRL_UNREGISTER for a ChipCallable cid to // every NEXT_LEVEL child in parallel. 
`blob_ptr`/`blob_size` describe diff --git a/src/common/hierarchical/worker_manager.cpp b/src/common/hierarchical/worker_manager.cpp index c26f3e2fe..390159af2 100644 --- a/src/common/hierarchical/worker_manager.cpp +++ b/src/common/hierarchical/worker_manager.cpp @@ -323,11 +323,13 @@ WorkerThread *WorkerManager::pick_idle_excluding(WorkerType type, const std::vec // WorkerThread — memory control (orch thread, concurrent with worker thread) // ============================================================================= -static void write_control_args(char *mbox, uint64_t sub_cmd, uint64_t a0 = 0, uint64_t a1 = 0, uint64_t a2 = 0) { +static void +write_control_args(char *mbox, uint64_t sub_cmd, uint64_t a0 = 0, uint64_t a1 = 0, uint64_t a2 = 0, uint64_t a3 = 0) { std::memcpy(mbox + MAILBOX_OFF_CALLABLE, &sub_cmd, sizeof(uint64_t)); std::memcpy(mbox + CTRL_OFF_ARG0, &a0, sizeof(uint64_t)); std::memcpy(mbox + CTRL_OFF_ARG1, &a1, sizeof(uint64_t)); std::memcpy(mbox + CTRL_OFF_ARG2, &a2, sizeof(uint64_t)); + std::memcpy(mbox + CTRL_OFF_ARG3, &a3, sizeof(uint64_t)); } static uint64_t read_control_result(const char *mbox) { @@ -336,6 +338,8 @@ static uint64_t read_control_result(const char *mbox) { return r; } +static void write_shm_name_pair(char *mbox, const char *request_shm_name, const char *reply_shm_name); + // Issue a control sub-command and block until the child publishes // CONTROL_DONE. Caller must hold `mailbox_mu_`. 
On a non-zero error code // from the child, throws and leaves the mailbox in IDLE before unwinding @@ -442,6 +446,54 @@ void WorkerThread::control_copy_from(uint64_t dst, uint64_t src, size_t size) { run_control_command("control_copy_from"); } +uint64_t WorkerThread::control_open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + std::lock_guard lk(mailbox_mu_); + write_control_args( + mbox(), CTRL_OPEN_MAPPED_REGION, data_bytes, static_cast(signal_count), static_cast(flags) + ); + run_control_command("control_open_mapped_region"); + return read_control_result(mbox()); +} + +void WorkerThread::control_close_mapped_region(uint64_t handle) { + std::lock_guard lk(mailbox_mu_); + write_control_args(mbox(), CTRL_CLOSE_MAPPED_REGION, handle); + run_control_command("control_close_mapped_region"); +} + +void WorkerThread::control_mapped_region_payload(uint64_t sub_cmd, const char *shm_name) { + if (sub_cmd != CTRL_MAPPED_REGION_INFO && sub_cmd != CTRL_MAPPED_REGION_DATACOPY_H2REGION && + sub_cmd != CTRL_MAPPED_REGION_DATACOPY_REGION2H) { + throw std::runtime_error("control_mapped_region_payload: invalid sub-command"); + } + if (!shm_name || !*shm_name) { + throw std::runtime_error("control_mapped_region_payload: shm name must be non-empty"); + } + std::lock_guard lk(mailbox_mu_); + std::memcpy(mbox() + MAILBOX_OFF_CALLABLE, &sub_cmd, sizeof(uint64_t)); + write_shm_name_pair(mbox(), shm_name, ""); + run_control_command("control_mapped_region_payload"); +} + +void WorkerThread::control_mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value) { + std::lock_guard lk(mailbox_mu_); + write_control_args( + mbox(), CTRL_MAPPED_REGION_NOTIFY, handle, static_cast(signal_id), static_cast(value) + ); + run_control_command("control_mapped_region_notify"); +} + +void WorkerThread::control_mapped_region_wait( + uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us +) { + std::lock_guard lk(mailbox_mu_); + write_control_args( 
+ mbox(), CTRL_MAPPED_REGION_WAIT, handle, static_cast(signal_id), static_cast(target), + static_cast(timeout_us) + ); + run_control_command("control_mapped_region_wait"); +} + // Stage two NUL-terminated shm names at MAILBOX_OFF_ARGS: request first // (CTRL_SHM_NAME_BYTES wide) then reply (CTRL_SHM_NAME_BYTES wide). Pads each // slot with zeros so stale bytes from a prior op cannot leak into the child's @@ -624,6 +676,49 @@ void WorkerManager::control_comm_init(int worker_id, const char *request_shm_nam wt->control_comm_init(request_shm_name); } +uint64_t +WorkerManager::control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id); + if (wt == nullptr) { + throw std::runtime_error("control_open_mapped_region: invalid worker_id " + std::to_string(worker_id)); + } + return wt->control_open_mapped_region(data_bytes, signal_count, flags); +} + +void WorkerManager::control_close_mapped_region(int worker_id, uint64_t handle) { + auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id); + if (wt == nullptr) { + throw std::runtime_error("control_close_mapped_region: invalid worker_id " + std::to_string(worker_id)); + } + wt->control_close_mapped_region(handle); +} + +void WorkerManager::control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const char *shm_name) { + auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id); + if (wt == nullptr) { + throw std::runtime_error("control_mapped_region_payload: invalid worker_id " + std::to_string(worker_id)); + } + wt->control_mapped_region_payload(sub_cmd, shm_name); +} + +void WorkerManager::control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value) { + auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id); + if (wt == nullptr) { + throw std::runtime_error("control_mapped_region_notify: invalid worker_id " + std::to_string(worker_id)); + } + 
wt->control_mapped_region_notify(handle, signal_id, value); +} + +void WorkerManager::control_mapped_region_wait( + int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us +) { + auto *wt = get_worker(WorkerType::NEXT_LEVEL, worker_id); + if (wt == nullptr) { + throw std::runtime_error("control_mapped_region_wait: invalid worker_id " + std::to_string(worker_id)); + } + wt->control_mapped_region_wait(handle, signal_id, target, timeout_us); +} + void WorkerManager::broadcast_register_all(int32_t cid, const void *blob_ptr, size_t blob_size) { if (next_level_threads_.empty()) return; diff --git a/src/common/hierarchical/worker_manager.h b/src/common/hierarchical/worker_manager.h index 76a4bf2c7..1af45cffb 100644 --- a/src/common/hierarchical/worker_manager.h +++ b/src/common/hierarchical/worker_manager.h @@ -123,16 +123,25 @@ static constexpr uint64_t CTRL_RELEASE_DOMAIN = 8; static constexpr uint64_t CTRL_COMM_INIT = 9; static constexpr uint64_t CTRL_PY_REGISTER = 10; static constexpr uint64_t CTRL_PY_UNREGISTER = 11; +static constexpr uint64_t CTRL_OPEN_MAPPED_REGION = 12; +static constexpr uint64_t CTRL_CLOSE_MAPPED_REGION = 13; +static constexpr uint64_t CTRL_MAPPED_REGION_INFO = 14; +static constexpr uint64_t CTRL_MAPPED_REGION_DATACOPY_H2REGION = 15; +static constexpr uint64_t CTRL_MAPPED_REGION_DATACOPY_REGION2H = 16; +static constexpr uint64_t CTRL_MAPPED_REGION_NOTIFY = 17; +static constexpr uint64_t CTRL_MAPPED_REGION_WAIT = 18; // Control args reuse the task mailbox region (mutually exclusive with task dispatch): // offset 16: uint64 arg0 (size for malloc; ptr for free; dst for copy; cid for register) // offset 24: uint64 arg1 (src for copy) // offset 32: uint64 arg2 (nbytes for copy) -// offset 40: uint64 result (returned ptr from malloc) +// offset 40: uint64 arg3 (timeout_us for mapped-region wait) +// offset 48: uint64 result (returned ptr from malloc / open) static constexpr ptrdiff_t CTRL_OFF_ARG0 = 16; static constexpr 
ptrdiff_t CTRL_OFF_ARG1 = 24; static constexpr ptrdiff_t CTRL_OFF_ARG2 = 32; -static constexpr ptrdiff_t CTRL_OFF_RESULT = 40; +static constexpr ptrdiff_t CTRL_OFF_ARG3 = 40; +static constexpr ptrdiff_t CTRL_OFF_RESULT = 48; // CTRL_REGISTER puts the NUL-terminated POSIX shm name at MAILBOX_OFF_ARGS. // Fixed-width so the wire layout stays simple; well above the encoded length @@ -209,6 +218,11 @@ class WorkerThread { void control_free(uint64_t ptr); void control_copy_to(uint64_t dst, uint64_t src, size_t size); void control_copy_from(uint64_t dst, uint64_t src, size_t size); + uint64_t control_open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags); + void control_close_mapped_region(uint64_t handle); + void control_mapped_region_payload(uint64_t sub_cmd, const char *shm_name); + void control_mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value); + void control_mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us); // Pre-warm a chip child by triggering prepare_callable for `cid` in the // child via CTRL_PREPARE. 
Issued from the parent at end of init() so the @@ -309,6 +323,13 @@ class WorkerManager { void control_alloc_domain(int worker_id, const char *request_shm_name, const char *reply_shm_name); void control_release_domain(int worker_id, const char *request_shm_name); void control_comm_init(int worker_id, const char *request_shm_name); + uint64_t control_open_mapped_region(int worker_id, uint64_t data_bytes, uint32_t signal_count, uint32_t flags); + void control_close_mapped_region(int worker_id, uint64_t handle); + void control_mapped_region_payload(int worker_id, uint64_t sub_cmd, const char *shm_name); + void control_mapped_region_notify(int worker_id, uint64_t handle, uint32_t signal_id, uint32_t value); + void control_mapped_region_wait( + int worker_id, uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us + ); // Broadcast CTRL_REGISTER for `cid` to every NEXT_LEVEL worker in // parallel. Stages `blob_size` bytes from `blob_ptr` into a per-call diff --git a/src/common/host_device_comm/host_device_mapped_region.cpp b/src/common/host_device_comm/host_device_mapped_region.cpp new file mode 100644 index 000000000..66999497a --- /dev/null +++ b/src/common/host_device_comm/host_device_mapped_region.cpp @@ -0,0 +1,490 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host_device_comm/host_device_mapped_region.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static_assert(sizeof(HostDeviceMappedRegionHeader) == 64); +static_assert(alignof(HostDeviceMappedRegionHeader) == 64); +static_assert(sizeof(HostDeviceMappedRegionSignalSlot) == 64); +static_assert(alignof(HostDeviceMappedRegionSignalSlot) == 64); +static_assert(offsetof(HostDeviceMappedRegionSignalSlot, value) == 0); + +static_assert(offsetof(HostDeviceMappedRegionInfo, host_data_ptr) == 0); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_data_ptr) == 8); +static_assert(offsetof(HostDeviceMappedRegionInfo, data_bytes) == 16); +static_assert(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr) == 24); +static_assert(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr) == 32); +static_assert(offsetof(HostDeviceMappedRegionInfo, signal_count) == 40); +static_assert(offsetof(HostDeviceMappedRegionInfo, total_bytes) == 48); +static_assert(offsetof(HostDeviceMappedRegionInfo, flags) == 56); +static_assert(sizeof(HostDeviceMappedRegionInfo) == 64); + +namespace { + +constexpr uint64_t kAlignment = 64; + +struct HostDeviceMappedRegion { + DeviceContextHandle owner_ctx = nullptr; + void *host_base = nullptr; + void *device_base = nullptr; + uint64_t total_bytes = 0; + uint64_t data_offset = 0; + uint64_t data_bytes = 0; + uint64_t signal_offset = 0; + uint32_t signal_count = 0; + uint32_t flags = 0; + std::mutex op_mu; + std::condition_variable op_cv; + uint32_t active_ops = 0; + HostDeviceMappedRegionPlatform platform{}; +}; + +std::mutex ®istry_mutex() { + static std::mutex mu; + return mu; +} + +std::unordered_map> ®istry_by_ctx() { + static std::unordered_map> registry; + return registry; +} + +std::unordered_map ®istry_by_handle() { + static std::unordered_map registry; + 
return registry; +} + +bool add_overflow(uint64_t a, uint64_t b, uint64_t *out) { + if (a > std::numeric_limits::max() - b) { + return true; + } + *out = a + b; + return false; +} + +bool mul_overflow(uint64_t a, uint64_t b, uint64_t *out) { + if (a != 0 && b > std::numeric_limits::max() / a) { + return true; + } + *out = a * b; + return false; +} + +bool align64(uint64_t value, uint64_t *out) { + uint64_t padded = 0; + if (add_overflow(value, kAlignment - 1, &padded)) { + return false; + } + *out = padded & ~(kAlignment - 1); + return true; +} + +uint8_t *byte_ptr(void *base, uint64_t offset) { return static_cast(base) + offset; } + +const uint8_t *byte_ptr(const void *base, uint64_t offset) { return static_cast(base) + offset; } + +HostDeviceMappedRegion *lookup_region_locked(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + if (ctx == nullptr || handle == nullptr) { + return nullptr; + } + auto it = registry_by_handle().find(handle); + if (it == registry_by_handle().end()) { + return nullptr; + } + HostDeviceMappedRegion *region = it->second; + if (region == nullptr || region->owner_ctx != ctx) { + return nullptr; + } + return region; +} + +HostDeviceMappedRegion *acquire_region(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + std::lock_guard lock(registry_mutex()); + HostDeviceMappedRegion *region = lookup_region_locked(ctx, handle); + if (region == nullptr) { + return nullptr; + } + { + std::lock_guard op_lock(region->op_mu); + ++region->active_ops; + } + return region; +} + +void release_region_op(HostDeviceMappedRegion *region) { + std::lock_guard op_lock(region->op_mu); + --region->active_ops; + if (region->active_ops == 0) { + region->op_cv.notify_all(); + } +} + +void wait_for_region_ops(HostDeviceMappedRegion *region) { + std::unique_lock op_lock(region->op_mu); + region->op_cv.wait(op_lock, [region] { + return region->active_ops == 0; + }); +} + +void release_region(HostDeviceMappedRegion *region) { + if (region == 
nullptr) { + return; + } + if (region->platform.release != nullptr) { + region->platform.release(®ion->platform); + } + delete region; +} + +int validate_range(HostDeviceMappedRegion *region, uint64_t offset, size_t nbytes) { + if (offset > region->data_bytes) { + return -EINVAL; + } + if (static_cast(nbytes) > region->data_bytes - offset) { + return -EINVAL; + } + return 0; +} + +HostDeviceMappedRegionSignalSlot *signal_slot(HostDeviceMappedRegion *region, uint32_t signal_id) { + auto *slots = + reinterpret_cast(byte_ptr(region->host_base, region->signal_offset)); + return &slots[signal_id]; +} + +int flush_host_range(HostDeviceMappedRegion *region, void *host_ptr, uint64_t bytes) { + if (bytes == 0 || region->platform.flush_host_range == nullptr) { + return 0; + } + return region->platform.flush_host_range(®ion->platform, host_ptr, bytes); +} + +int invalidate_host_range(HostDeviceMappedRegion *region, void *host_ptr, uint64_t bytes) { + if (bytes == 0 || region->platform.invalidate_host_range == nullptr) { + return 0; + } + return region->platform.invalidate_host_range(®ion->platform, host_ptr, bytes); +} + +} // namespace + +int host_device_mapped_region_compute_total_bytes( + const HostDeviceMappedRegionConfig *cfg, uint64_t *signal_offset, uint64_t *data_offset, uint64_t *total_bytes +) { + if (cfg == nullptr || signal_offset == nullptr || data_offset == nullptr || total_bytes == nullptr) { + return -EINVAL; + } + if (cfg->data_bytes == 0 || cfg->signal_count == 0 || cfg->flags != 0) { + return -EINVAL; + } + + uint64_t signal_bytes = 0; + if (mul_overflow(cfg->signal_count, sizeof(HostDeviceMappedRegionSignalSlot), &signal_bytes)) { + return -EINVAL; + } + + *signal_offset = sizeof(HostDeviceMappedRegionHeader); + uint64_t signals_end = 0; + if (add_overflow(*signal_offset, signal_bytes, &signals_end)) { + return -EINVAL; + } + if (!align64(signals_end, data_offset)) { + return -EINVAL; + } + + uint64_t data_end = 0; + if (add_overflow(*data_offset, 
cfg->data_bytes, &data_end)) { + return -EINVAL; + } + if (!align64(data_end, total_bytes)) { + return -EINVAL; + } + return 0; +} + +int host_device_mapped_region_open_common( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region, + HostDeviceMappedRegionAllocateFn allocate +) { + if (out_region != nullptr) { + *out_region = nullptr; + } + if (ctx == nullptr || cfg == nullptr || out_region == nullptr || allocate == nullptr) { + return -EINVAL; + } + + uint64_t signal_offset = 0; + uint64_t data_offset = 0; + uint64_t total_bytes = 0; + int rc = host_device_mapped_region_compute_total_bytes(cfg, &signal_offset, &data_offset, &total_bytes); + if (rc != 0) { + return rc; + } + + auto *region = new (std::nothrow) HostDeviceMappedRegion; + if (region == nullptr) { + return -ENOMEM; + } + region->owner_ctx = ctx; + region->total_bytes = total_bytes; + region->data_offset = data_offset; + region->data_bytes = cfg->data_bytes; + region->signal_offset = signal_offset; + region->signal_count = cfg->signal_count; + region->flags = cfg->flags; + + rc = allocate(ctx, total_bytes, ®ion->platform, ®ion->host_base, ®ion->device_base); + if (rc != 0) { + delete region; + return rc; + } + if (region->host_base == nullptr || region->device_base == nullptr) { + release_region(region); + return -EIO; + } + + std::memset(region->host_base, 0, static_cast(total_bytes)); + auto *header = reinterpret_cast(region->host_base); + header->magic = HDMR_MAGIC; + header->version = HDMR_VERSION; + header->flags = cfg->flags; + header->signal_count = cfg->signal_count; + header->signal_offset = signal_offset; + header->data_offset = data_offset; + header->data_bytes = cfg->data_bytes; + header->total_bytes = total_bytes; + rc = flush_host_range(region, region->host_base, total_bytes); + if (rc != 0) { + release_region(region); + return -EIO; + } + + HostDeviceMappedRegionHandle handle = static_cast(region); + { + std::lock_guard 
lock(registry_mutex()); + registry_by_handle()[handle] = region; + registry_by_ctx()[ctx].push_back(region); + } + *out_region = handle; + return 0; +} + +int host_device_mapped_region_close_common(DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle) { + HostDeviceMappedRegion *region = nullptr; + { + std::lock_guard lock(registry_mutex()); + region = lookup_region_locked(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + registry_by_handle().erase(handle); + auto ®ions = registry_by_ctx()[ctx]; + for (auto it = regions.begin(); it != regions.end(); ++it) { + if (*it == region) { + regions.erase(it); + break; + } + } + if (regions.empty()) { + registry_by_ctx().erase(ctx); + } + } + wait_for_region_ops(region); + release_region(region); + return 0; +} + +void host_device_mapped_region_close_all_common(DeviceContextHandle ctx) { + if (ctx == nullptr) { + return; + } + std::vector regions; + { + std::lock_guard lock(registry_mutex()); + auto it = registry_by_ctx().find(ctx); + if (it == registry_by_ctx().end()) { + return; + } + regions.swap(it->second); + registry_by_ctx().erase(it); + for (HostDeviceMappedRegion *region : regions) { + registry_by_handle().erase(static_cast(region)); + } + } + for (HostDeviceMappedRegion *region : regions) { + wait_for_region_ops(region); + release_region(region); + } +} + +int host_device_mapped_region_info_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, HostDeviceMappedRegionInfo *info +) { + if (info == nullptr) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + std::memset(info, 0, sizeof(*info)); + info->host_data_ptr = reinterpret_cast(byte_ptr(region->host_base, region->data_offset)); + info->device_data_ptr = reinterpret_cast(byte_ptr(region->device_base, region->data_offset)); + info->data_bytes = region->data_bytes; + info->host_signal_ptr = reinterpret_cast(byte_ptr(region->host_base, 
region->signal_offset)); + info->device_signal_ptr = reinterpret_cast(byte_ptr(region->device_base, region->signal_offset)); + info->signal_count = region->signal_count; + info->total_bytes = region->total_bytes; + info->flags = region->flags; + release_region_op(region); + return 0; +} + +int host_device_mapped_region_datacopy_h2region_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint64_t offset, const void *src, size_t nbytes +) { + if (src == nullptr && nbytes != 0) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + int rc = validate_range(region, offset, nbytes); + if (rc != 0) { + release_region_op(region); + return rc; + } + if (nbytes != 0) { + uint8_t *dst = byte_ptr(region->host_base, region->data_offset + offset); + std::memcpy(dst, src, nbytes); + rc = flush_host_range(region, dst, static_cast(nbytes)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_datacopy_region2h_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint64_t offset, void *dst, size_t nbytes +) { + if (dst == nullptr && nbytes != 0) { + return -EINVAL; + } + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr) { + return -EINVAL; + } + int rc = validate_range(region, offset, nbytes); + if (rc != 0) { + release_region_op(region); + return rc; + } + if (nbytes != 0) { + uint8_t *src = byte_ptr(region->host_base, region->data_offset + offset); + rc = invalidate_host_range(region, src, static_cast(nbytes)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + std::memcpy(dst, src, nbytes); + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_notify_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint32_t signal_id, uint32_t value +) { + HostDeviceMappedRegion 
*region = acquire_region(ctx, handle); + if (region == nullptr || signal_id >= region->signal_count) { + if (region != nullptr) { + release_region_op(region); + } + return -EINVAL; + } + auto *slot = signal_slot(region, signal_id); + auto *atomic_value = reinterpret_cast *>(const_cast(&slot->value)); + uint32_t current = atomic_value->load(std::memory_order_acquire); + if (value < current) { + release_region_op(region); + return -EINVAL; + } + atomic_value->store(value, std::memory_order_release); + int rc = flush_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + release_region_op(region); + return 0; +} + +int host_device_mapped_region_wait_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle handle, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +) { + HostDeviceMappedRegion *region = acquire_region(ctx, handle); + if (region == nullptr || signal_id >= region->signal_count) { + if (region != nullptr) { + release_region_op(region); + } + return -EINVAL; + } + HostDeviceMappedRegionSignalSlot *slot = signal_slot(region, signal_id); + + int rc = invalidate_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + auto *atomic_value = reinterpret_cast *>(const_cast(&slot->value)); + if (atomic_value->load(std::memory_order_acquire) >= target) { + release_region_op(region); + return 0; + } + if (timeout_us == 0) { + release_region_op(region); + return -EAGAIN; + } + + const auto deadline = std::chrono::steady_clock::now() + std::chrono::microseconds(timeout_us); + do { + rc = invalidate_host_range(region, slot, sizeof(*slot)); + if (rc != 0) { + release_region_op(region); + return -EIO; + } + if (atomic_value->load(std::memory_order_acquire) >= target) { + release_region_op(region); + return 0; + } + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } while (std::chrono::steady_clock::now() < deadline); + + 
release_region_op(region); + return -EAGAIN; +} diff --git a/src/common/host_device_comm/host_device_mapped_region.h b/src/common/host_device_comm/host_device_mapped_region.h new file mode 100644 index 000000000..9ef562893 --- /dev/null +++ b/src/common/host_device_comm/host_device_mapped_region.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ +#define SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ + +#include +#include + +#include "worker/pto_runtime_c_api.h" + +static constexpr uint32_t HDMR_MAGIC = 0x48444D52U; +static constexpr uint32_t HDMR_VERSION = 1; + +struct alignas(64) HostDeviceMappedRegionHeader { + uint32_t magic; + uint32_t version; + uint32_t flags; + uint32_t signal_count; + uint64_t signal_offset; + uint64_t data_offset; + uint64_t data_bytes; + uint64_t total_bytes; + uint64_t reserved[2]; +}; + +struct alignas(64) HostDeviceMappedRegionSignalSlot { + volatile uint32_t value; + uint32_t reserved0; + uint64_t reserved[7]; +}; + +struct HostDeviceMappedRegionPlatform { + void *resource; + uint64_t device_id; + void *cache_ops_cookie; + int (*flush_host_range)(HostDeviceMappedRegionPlatform *platform, void *host_ptr, uint64_t bytes); + int 
(*invalidate_host_range)(HostDeviceMappedRegionPlatform *platform, void *host_ptr, uint64_t bytes); + void (*release)(HostDeviceMappedRegionPlatform *platform); +}; + +using HostDeviceMappedRegionAllocateFn = int (*)( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +); + +int host_device_mapped_region_compute_total_bytes( + const HostDeviceMappedRegionConfig *cfg, uint64_t *signal_offset, uint64_t *data_offset, uint64_t *total_bytes +); + +int host_device_mapped_region_open_common( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region, + HostDeviceMappedRegionAllocateFn allocate +); + +int host_device_mapped_region_close_common(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region); + +void host_device_mapped_region_close_all_common(DeviceContextHandle ctx); + +int host_device_mapped_region_info_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +); + +int host_device_mapped_region_notify_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +); + +int host_device_mapped_region_wait_common( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +); + +#endif // SRC_COMMON_WORKER_HOST_DEVICE_MAPPED_REGION_H_ diff --git a/src/common/host_device_comm/host_device_mapped_region_sim.cpp b/src/common/host_device_comm/host_device_mapped_region_sim.cpp new file mode 100644 index 000000000..6894c5800 --- /dev/null +++ 
b/src/common/host_device_comm/host_device_mapped_region_sim.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host_device_comm/host_device_mapped_region_sim.h" + +#include + +#include + +int host_device_mapped_region_allocate_sim( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + (void)ctx; + void *ptr = nullptr; + if (posix_memalign(&ptr, 64, static_cast(total_bytes)) != 0) { + return -ENOMEM; + } + platform->resource = ptr; + platform->release = [](HostDeviceMappedRegionPlatform *p) { + std::free(p->resource); + p->resource = nullptr; + }; + *host_base = ptr; + *device_base = ptr; + return 0; +} diff --git a/src/common/host_device_comm/host_device_mapped_region_sim.h b/src/common/host_device_comm/host_device_mapped_region_sim.h new file mode 100644 index 000000000..b725e8bf1 --- /dev/null +++ b/src/common/host_device_comm/host_device_mapped_region_sim.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ +#define SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ + +#include "host_device_comm/host_device_mapped_region.h" + +int host_device_mapped_region_allocate_sim( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +); + +#endif // SRC_COMMON_HOST_DEVICE_COMM_HOST_DEVICE_MAPPED_REGION_SIM_H_ diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 7fab4c295..566cf0357 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -12,6 +12,7 @@ #include "chip_worker.h" #include +#include #include #include @@ -125,6 +126,15 @@ void ChipWorker::init( load_symbol(handle, "comm_release_domain_windows"); comm_barrier_fn_ = load_symbol(handle, "comm_barrier"); comm_destroy_fn_ = load_symbol(handle, "comm_destroy"); + open_mapped_region_fn_ = load_symbol(handle, "open_host_device_mapped_region_ctx"); + close_mapped_region_fn_ = load_symbol(handle, "close_host_device_mapped_region_ctx"); + mapped_region_info_fn_ = load_symbol(handle, "host_device_mapped_region_info_ctx"); + mapped_region_datacopy_h2region_fn_ = + load_symbol(handle, "host_device_mapped_region_datacopy_h2region_ctx"); + mapped_region_datacopy_region2h_fn_ = + load_symbol(handle, "host_device_mapped_region_datacopy_region2h_ctx"); + mapped_region_notify_fn_ = load_symbol(handle, 
"host_device_mapped_region_notify_ctx"); + mapped_region_wait_fn_ = load_symbol(handle, "host_device_mapped_region_wait_ctx"); } catch (...) { dlclose(handle); throw; @@ -188,6 +198,13 @@ void ChipWorker::init( comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); throw; } @@ -227,6 +244,13 @@ void ChipWorker::init( comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); throw std::runtime_error("simpler_init failed with code " + std::to_string(init_rc)); } @@ -276,6 +300,13 @@ void ChipWorker::finalize() { comm_release_domain_windows_fn_ = nullptr; comm_barrier_fn_ = nullptr; comm_destroy_fn_ = nullptr; + open_mapped_region_fn_ = nullptr; + close_mapped_region_fn_ = nullptr; + mapped_region_info_fn_ = nullptr; + mapped_region_datacopy_h2region_fn_ = nullptr; + mapped_region_datacopy_region2h_fn_ = nullptr; + mapped_region_notify_fn_ = nullptr; + mapped_region_wait_fn_ = nullptr; runtime_buf_.clear(); initialized_ = false; device_id_ = -1; @@ -482,6 +513,95 @@ void ChipWorker::copy_from(uint64_t dst, uint64_t src, size_t size) { } } +void ChipWorker::check_mapped_region_rc(int rc, const char *op_name) { + if (rc == 0) { + return; + } + std::string msg = op_name; + msg += " failed with code "; + msg += std::to_string(rc); + if (rc == -EINVAL) { + throw std::invalid_argument(msg); + } + 
throw std::runtime_error(msg); +} + +uint64_t ChipWorker::open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + HostDeviceMappedRegionConfig cfg{data_bytes, signal_count, flags}; + HostDeviceMappedRegionHandle region = nullptr; + int rc = open_mapped_region_fn_(device_ctx_, &cfg, ®ion); + check_mapped_region_rc(rc, "open_mapped_region"); + if (region == nullptr) { + throw std::runtime_error("open_mapped_region returned null handle"); + } + return reinterpret_cast(region); +} + +void ChipWorker::close_mapped_region(uint64_t handle) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = close_mapped_region_fn_(device_ctx_, reinterpret_cast(handle)); + check_mapped_region_rc(rc, "close_mapped_region"); +} + +HostDeviceMappedRegionInfo ChipWorker::mapped_region_info(uint64_t handle) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + HostDeviceMappedRegionInfo info{}; + int rc = mapped_region_info_fn_(device_ctx_, reinterpret_cast(handle), &info); + check_mapped_region_rc(rc, "mapped_region_info"); + info.host_data_ptr = 0; + info.host_signal_ptr = 0; + return info; +} + +void ChipWorker::mapped_region_datacopy_h2region(uint64_t handle, uint64_t offset, const void *src, size_t nbytes) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = mapped_region_datacopy_h2region_fn_( + device_ctx_, reinterpret_cast(handle), offset, src, nbytes + ); + check_mapped_region_rc(rc, "mapped_region_datacopy_h2region"); +} + +void ChipWorker::mapped_region_datacopy_region2h(uint64_t handle, uint64_t offset, void *dst, size_t nbytes) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = 
mapped_region_datacopy_region2h_fn_( + device_ctx_, reinterpret_cast(handle), offset, dst, nbytes + ); + check_mapped_region_rc(rc, "mapped_region_datacopy_region2h"); +} + +void ChipWorker::mapped_region_notify(uint64_t handle, uint32_t signal_id, uint32_t value) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = + mapped_region_notify_fn_(device_ctx_, reinterpret_cast(handle), signal_id, value); + check_mapped_region_rc(rc, "mapped_region_notify"); +} + +void ChipWorker::mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = mapped_region_wait_fn_( + device_ctx_, reinterpret_cast(handle), signal_id, target, timeout_us + ); + if (rc == -EAGAIN || rc == -EWOULDBLOCK) { + throw std::runtime_error("mapped_region_wait timed out"); + } + check_mapped_region_rc(rc, "mapped_region_wait"); +} + uint64_t ChipWorker::comm_init(int rank, int nranks, const std::string &rootinfo_path) { if (!initialized_) { throw std::runtime_error("ChipWorker not initialized; call init() first"); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 2227245f1..7abc8ab76 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -81,6 +81,14 @@ class ChipWorker { void copy_to(uint64_t dst, uint64_t src, size_t size); void copy_from(uint64_t dst, uint64_t src, size_t size); + uint64_t open_mapped_region(uint64_t data_bytes, uint32_t signal_count, uint32_t flags); + void close_mapped_region(uint64_t handle); + HostDeviceMappedRegionInfo mapped_region_info(uint64_t handle); + void mapped_region_datacopy_h2region(uint64_t handle, uint64_t offset, const void *src, size_t nbytes); + void mapped_region_datacopy_region2h(uint64_t handle, uint64_t offset, void *dst, size_t nbytes); + void mapped_region_notify(uint64_t handle, 
uint32_t signal_id, uint32_t value); + void mapped_region_wait(uint64_t handle, uint32_t signal_id, uint32_t target, uint32_t timeout_us); + /// Distributed communication primitives (optional — only available when /// the bound runtime exports comm_*). Wraps the backend-neutral C API /// defined in src//platform/include/host/comm.h. @@ -158,6 +166,14 @@ class ChipWorker { using CommReleaseDomainWindowsFn = int (*)(void *, uint64_t, size_t, uint32_t); using CommBarrierFn = int (*)(void *); using CommDestroyFn = int (*)(void *); + using OpenMappedRegionFn = int (*)(void *, const HostDeviceMappedRegionConfig *, HostDeviceMappedRegionHandle *); + using CloseMappedRegionFn = int (*)(void *, HostDeviceMappedRegionHandle); + using MappedRegionInfoFn = int (*)(void *, HostDeviceMappedRegionHandle, HostDeviceMappedRegionInfo *); + using MappedRegionDatacopyH2RegionFn = + int (*)(void *, HostDeviceMappedRegionHandle, uint64_t, const void *, size_t); + using MappedRegionDatacopyRegion2HFn = int (*)(void *, HostDeviceMappedRegionHandle, uint64_t, void *, size_t); + using MappedRegionNotifyFn = int (*)(void *, HostDeviceMappedRegionHandle, uint32_t, uint32_t); + using MappedRegionWaitFn = int (*)(void *, HostDeviceMappedRegionHandle, uint32_t, uint32_t, uint32_t); struct CommSession { void *handle = nullptr; @@ -175,6 +191,7 @@ class ChipWorker { int destroy_comm_session(CommSession &session); uint64_t create_base_comm(int rank, int nranks, const std::string &rootinfo_path); void clear_comm_sessions(); + void check_mapped_region_rc(int rc, const char *op_name); void *lib_handle_ = nullptr; CreateDeviceContextFn create_device_context_fn_ = nullptr; @@ -203,6 +220,13 @@ class ChipWorker { CommReleaseDomainWindowsFn comm_release_domain_windows_fn_ = nullptr; CommBarrierFn comm_barrier_fn_ = nullptr; CommDestroyFn comm_destroy_fn_ = nullptr; + OpenMappedRegionFn open_mapped_region_fn_ = nullptr; + CloseMappedRegionFn close_mapped_region_fn_ = nullptr; + MappedRegionInfoFn 
mapped_region_info_fn_ = nullptr; + MappedRegionDatacopyH2RegionFn mapped_region_datacopy_h2region_fn_ = nullptr; + MappedRegionDatacopyRegion2HFn mapped_region_datacopy_region2h_fn_ = nullptr; + MappedRegionNotifyFn mapped_region_notify_fn_ = nullptr; + MappedRegionWaitFn mapped_region_wait_fn_ = nullptr; void *device_ctx_ = nullptr; std::vector comm_sessions_; std::unordered_map comm_session_index_; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 00debb446..a55f855ce 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -47,6 +47,7 @@ extern "C" { typedef void *RuntimeHandle; typedef void *DeviceContextHandle; +typedef void *HostDeviceMappedRegionHandle; /** * Timing breakdown for a single run_prepared() invocation. @@ -73,6 +74,25 @@ typedef struct PtoRunTiming { uint64_t device_wall_ns; } PtoRunTiming; +typedef struct HostDeviceMappedRegionConfig { + uint64_t data_bytes; + uint32_t signal_count; + uint32_t flags; +} HostDeviceMappedRegionConfig; + +typedef struct HostDeviceMappedRegionInfo { + uint64_t host_data_ptr; + uint64_t device_data_ptr; + uint64_t data_bytes; + uint64_t host_signal_ptr; + uint64_t device_signal_ptr; + uint32_t signal_count; + uint32_t reserved0; + uint64_t total_bytes; + uint32_t flags; + uint32_t reserved1; +} HostDeviceMappedRegionInfo; + /* =========================================================================== * Public API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -141,6 +161,33 @@ int simpler_init( */ int finalize_device(DeviceContextHandle ctx); +int open_host_device_mapped_region_ctx( + DeviceContextHandle ctx, const HostDeviceMappedRegionConfig *cfg, HostDeviceMappedRegionHandle *out_region +); + +int close_host_device_mapped_region_ctx(DeviceContextHandle ctx, HostDeviceMappedRegionHandle region); + +int host_device_mapped_region_info_ctx( + 
DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, HostDeviceMappedRegionInfo *info +); + +int host_device_mapped_region_datacopy_h2region_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, const void *src, size_t nbytes +); + +int host_device_mapped_region_datacopy_region2h_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint64_t offset, void *dst, size_t nbytes +); + +int host_device_mapped_region_notify_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t value +); + +int host_device_mapped_region_wait_ctx( + DeviceContextHandle ctx, HostDeviceMappedRegionHandle region, uint32_t signal_id, uint32_t target, + uint32_t timeout_us +); + /* =========================================================================== * Per-callable_id preparation * diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 89314d800..3445a2c2f 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -237,6 +237,7 @@ add_hierarchical_test(test_ring hierarchical/test_ring.cpp) add_hierarchical_test(test_scope hierarchical/test_scope.cpp) add_hierarchical_test(test_orchestrator hierarchical/test_orchestrator.cpp) add_hierarchical_test(test_scheduler hierarchical/test_scheduler.cpp) +add_hierarchical_test(test_mailbox_control_layout hierarchical/test_mailbox_control_layout.cpp) # --------------------------------------------------------------------------- # Types / task_interface tests (src/common/task_interface/) @@ -271,6 +272,24 @@ add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) add_common_utils_test(test_device_arena common/test_device_arena.cpp) +add_executable(test_host_device_mapped_region + common/test_host_device_mapped_region.cpp + ${CMAKE_SOURCE_DIR}/../../../src/common/host_device_comm/host_device_mapped_region.cpp +) 
+target_include_directories(test_host_device_mapped_region PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/common + ${CMAKE_SOURCE_DIR}/../../../src/common/worker +) +target_compile_options(test_host_device_mapped_region PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0) +target_link_libraries(test_host_device_mapped_region PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_host_device_mapped_region COMMAND test_host_device_mapped_region) +set_tests_properties(test_host_device_mapped_region PROPERTIES LABELS "no_hardware") + # Per-callable_id orch SO file naming regression (see rtStreamSynchronize # 507018 root cause). Compiles the a2a3 onboard `create_orch_so_file` # against the test source so it runs on no-hw runners too. diff --git a/tests/ut/cpp/common/test_host_device_mapped_region.cpp b/tests/ut/cpp/common/test_host_device_mapped_region.cpp new file mode 100644 index 000000000..bcd608524 --- /dev/null +++ b/tests/ut/cpp/common/test_host_device_mapped_region.cpp @@ -0,0 +1,319 @@ +#include "host_device_comm/host_device_mapped_region.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct CacheOpsRecorder { + int flush_count = 0; + int invalidate_count = 0; + std::vector events; +}; + +int allocate_heap_region( + DeviceContextHandle, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + void *ptr = nullptr; + if (posix_memalign(&ptr, 64, static_cast(total_bytes)) != 0) { + return -ENOMEM; + } + platform->resource = ptr; + platform->release = [](HostDeviceMappedRegionPlatform *p) { + std::free(p->resource); + p->resource = nullptr; + }; + *host_base = ptr; + *device_base = ptr; + return 0; +} + +int allocate_heap_region_with_cache_ops( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, 
total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + platform->cache_ops_cookie = ctx; + platform->flush_host_range = [](HostDeviceMappedRegionPlatform *p, void *, uint64_t) { + auto *recorder = static_cast(p->cache_ops_cookie); + ++recorder->flush_count; + recorder->events.push_back("flush"); + return 0; + }; + platform->invalidate_host_range = [](HostDeviceMappedRegionPlatform *p, void *, uint64_t) { + auto *recorder = static_cast(p->cache_ops_cookie); + ++recorder->invalidate_count; + recorder->events.push_back("invalidate"); + return 0; + }; + return 0; +} + +struct ReleaseRecorder { + int release_count = 0; +}; + +struct ReleaseState { + ReleaseRecorder *recorder = nullptr; + void *resource = nullptr; +}; + +void release_recorded_heap_region(HostDeviceMappedRegionPlatform *p) { + auto *state = static_cast(p->resource); + ++state->recorder->release_count; + std::free(state->resource); + delete state; + p->resource = nullptr; +} + +int allocate_region_without_host_base( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + auto *recorder = static_cast(ctx); + void *resource = platform->resource; + platform->resource = new ReleaseState{recorder, resource}; + platform->release = release_recorded_heap_region; + *host_base = nullptr; + return 0; +} + +int allocate_region_with_failing_initial_flush( + DeviceContextHandle ctx, uint64_t total_bytes, HostDeviceMappedRegionPlatform *platform, void **host_base, + void **device_base +) { + int rc = allocate_heap_region(ctx, total_bytes, platform, host_base, device_base); + if (rc != 0) { + return rc; + } + auto *recorder = static_cast(ctx); + void *resource = platform->resource; + platform->resource = new ReleaseState{recorder, resource}; + platform->release = release_recorded_heap_region; + 
platform->flush_host_range = [](HostDeviceMappedRegionPlatform *, void *, uint64_t) { + return -EIO; + }; + return 0; +} + +} // namespace + +TEST(HostDeviceMappedRegion, PublicAbiLayoutIsStable) { + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, host_data_ptr), 0u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, device_data_ptr), 8u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, data_bytes), 16u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, host_signal_ptr), 24u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, device_signal_ptr), 32u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, signal_count), 40u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, total_bytes), 48u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionInfo, flags), 56u); + EXPECT_EQ(sizeof(HostDeviceMappedRegionInfo), 64u); +} + +TEST(HostDeviceMappedRegion, InternalLayoutIsStable) { + EXPECT_EQ(sizeof(HostDeviceMappedRegionHeader), 64u); + EXPECT_EQ(alignof(HostDeviceMappedRegionHeader), 64u); + EXPECT_EQ(sizeof(HostDeviceMappedRegionSignalSlot), 64u); + EXPECT_EQ(alignof(HostDeviceMappedRegionSignalSlot), 64u); + EXPECT_EQ(offsetof(HostDeviceMappedRegionSignalSlot, value), 0u); +} + +TEST(HostDeviceMappedRegion, RejectsInvalidConfig) { + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + HostDeviceMappedRegionConfig cfg{0, 1, 0}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, ®ion, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); + + cfg = HostDeviceMappedRegionConfig{16, 0, 0}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, ®ion, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); + + cfg = HostDeviceMappedRegionConfig{16, 1, 1}; + EXPECT_EQ( + host_device_mapped_region_open_common( + reinterpret_cast(0x10), &cfg, ®ion, allocate_heap_region + ), + -EINVAL + ); + EXPECT_EQ(region, nullptr); +} + +TEST(HostDeviceMappedRegion, 
ReleasesBackendResourceWhenOpenValidationFailsAfterAllocate) { + ReleaseRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{16, 1, 0}; + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + + EXPECT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_region_without_host_base), -EIO); + EXPECT_EQ(region, nullptr); + EXPECT_EQ(recorder.release_count, 1); +} + +TEST(HostDeviceMappedRegion, ReleasesBackendResourceWhenInitialFlushFails) { + ReleaseRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{16, 1, 0}; + HostDeviceMappedRegionHandle region = reinterpret_cast(0x1); + + EXPECT_EQ( + host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_region_with_failing_initial_flush), -EIO + ); + EXPECT_EQ(region, nullptr); + EXPECT_EQ(recorder.release_count, 1); +} + +TEST(HostDeviceMappedRegion, OpensZeroInitializedRegionAndReportsInfo) { + auto ctx = reinterpret_cast(0x20); + HostDeviceMappedRegionConfig cfg{17, 2, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_heap_region), 0); + ASSERT_NE(region, nullptr); + + HostDeviceMappedRegionInfo info{}; + ASSERT_EQ(host_device_mapped_region_info_common(ctx, region, &info), 0); + EXPECT_NE(info.host_data_ptr, 0u); + EXPECT_EQ(info.host_data_ptr, info.device_data_ptr); + EXPECT_NE(info.host_signal_ptr, 0u); + EXPECT_EQ(info.host_signal_ptr, info.device_signal_ptr); + EXPECT_EQ(info.data_bytes, 17u); + EXPECT_EQ(info.signal_count, 2u); + EXPECT_EQ(info.flags, 0u); + EXPECT_EQ(info.total_bytes, 64u + 2u * 64u + 64u); + + auto *host_base = reinterpret_cast(info.host_signal_ptr - sizeof(HostDeviceMappedRegionHeader)); + auto *header = reinterpret_cast(host_base); + EXPECT_EQ(header->magic, HDMR_MAGIC); + EXPECT_EQ(header->version, HDMR_VERSION); + EXPECT_EQ(header->flags, 0u); + EXPECT_EQ(header->signal_count, 2u); + 
EXPECT_EQ(header->signal_offset, 64u); + EXPECT_EQ(header->data_offset, 64u + 2u * 64u); + EXPECT_EQ(header->data_bytes, 17u); + EXPECT_EQ(header->total_bytes, info.total_bytes); + + std::vector out(17, 0xAA); + ASSERT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, out.data(), out.size()), 0); + EXPECT_EQ(out, std::vector(17, 0)); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, DatacopyValidatesBoundsAndRoundTripsBytes) { + auto ctx = reinterpret_cast(0x30); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_heap_region), 0); + + const uint8_t input[4] = {1, 2, 3, 4}; + ASSERT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 2, input, sizeof(input)), 0); + uint8_t output[8] = {}; + ASSERT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, output, sizeof(output)), 0); + const uint8_t expected[8] = {0, 0, 1, 2, 3, 4, 0, 0}; + EXPECT_EQ(std::memcmp(output, expected, sizeof(expected)), 0); + + EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 8, input, 0), 0); + EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 8, input, 1), -EINVAL); + EXPECT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 9, output, 0), -EINVAL); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, InvokesPlatformCacheOpsAroundHostAccess) { + CacheOpsRecorder recorder; + auto ctx = reinterpret_cast(&recorder); + HostDeviceMappedRegionConfig cfg{8, 2, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_heap_region_with_cache_ops), 0); + EXPECT_GT(recorder.flush_count, 0); + + recorder.events.clear(); + const uint8_t input[4] = {1, 2, 3, 4}; + 
EXPECT_EQ(host_device_mapped_region_datacopy_h2region_common(ctx, region, 0, input, sizeof(input)), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.back(), "flush"); + + recorder.events.clear(); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 1), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.back(), "flush"); + + recorder.events.clear(); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 1, 0), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.front(), "invalidate"); + + recorder.events.clear(); + uint8_t output[4] = {}; + EXPECT_EQ(host_device_mapped_region_datacopy_region2h_common(ctx, region, 0, output, sizeof(output)), 0); + ASSERT_FALSE(recorder.events.empty()); + EXPECT_EQ(recorder.events.front(), "invalidate"); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, NotifyWaitUsesMonotonicSignals) { + auto ctx = reinterpret_cast(0x40); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_heap_region), 0); + + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 0, 0), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 1, 0), -EAGAIN); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 7), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 7, 0), 0); + EXPECT_EQ(host_device_mapped_region_wait_common(ctx, region, 0, 8, 100), -EAGAIN); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 0, 6), -EINVAL); + EXPECT_EQ(host_device_mapped_region_notify_common(ctx, region, 1, 8), -EINVAL); + + EXPECT_EQ(host_device_mapped_region_close_common(ctx, region), 0); +} + +TEST(HostDeviceMappedRegion, RejectsStaleAndCrossContextHandles) { + auto ctx_a = reinterpret_cast(0x50); + auto ctx_b = reinterpret_cast(0x51); + 
HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx_a, &cfg, ®ion, allocate_heap_region), 0); + + HostDeviceMappedRegionInfo info{}; + EXPECT_EQ(host_device_mapped_region_info_common(ctx_b, region, &info), -EINVAL); + EXPECT_EQ(host_device_mapped_region_close_common(ctx_a, region), 0); + EXPECT_EQ(host_device_mapped_region_close_common(ctx_a, region), -EINVAL); + EXPECT_EQ(host_device_mapped_region_info_common(ctx_a, region, &info), -EINVAL); +} + +TEST(HostDeviceMappedRegion, CloseAllReleasesContextRegions) { + auto ctx = reinterpret_cast(0x60); + HostDeviceMappedRegionConfig cfg{8, 1, 0}; + HostDeviceMappedRegionHandle region = nullptr; + ASSERT_EQ(host_device_mapped_region_open_common(ctx, &cfg, ®ion, allocate_heap_region), 0); + host_device_mapped_region_close_all_common(ctx); + HostDeviceMappedRegionInfo info{}; + EXPECT_EQ(host_device_mapped_region_info_common(ctx, region, &info), -EINVAL); +} diff --git a/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp b/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp new file mode 100644 index 000000000..a531817a0 --- /dev/null +++ b/tests/ut/cpp/hierarchical/test_mailbox_control_layout.cpp @@ -0,0 +1,11 @@ +#include + +#include "worker_manager.h" + +TEST(MailboxControlLayout, UsesFourArgsAndMovesResultToOffset48) { + EXPECT_EQ(CTRL_OFF_ARG0, 16); + EXPECT_EQ(CTRL_OFF_ARG1, 24); + EXPECT_EQ(CTRL_OFF_ARG2, 32); + EXPECT_EQ(CTRL_OFF_ARG3, 40); + EXPECT_EQ(CTRL_OFF_RESULT, 48); +} diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 0d6762c14..41fe8cd08 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -9,7 +9,7 @@ """Tests for CallConfig and ChipWorker state machine.""" import pytest -from _task_interface import CallConfig, _ChipWorker # pyright: ignore[reportMissingImports] +from _task_interface import CallConfig, MappedRegionInfo, _ChipWorker # 
pyright: ignore[reportMissingImports] # ============================================================================ # CallConfig tests @@ -136,6 +136,34 @@ def test_unregister_callable_before_init_raises(self): with pytest.raises(RuntimeError, match="not initialized"): worker.unregister_callable(0) + def test_mapped_region_methods_before_init_raise(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="not initialized"): + worker.open_mapped_region(8, 1, 0) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_info(1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_datacopy_h2region(1, 0, b"x") + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_datacopy_region2h(1, 0, 1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_notify(1, 0, 1) + with pytest.raises(RuntimeError, match="not initialized"): + worker.mapped_region_wait(1, 0, 1, 0) + with pytest.raises(RuntimeError, match="not initialized"): + worker.close_mapped_region(1) + + def test_mapped_region_info_is_structured(self): + info = MappedRegionInfo(0, 0x1000, 16, 0, 0x2000, 2, 256, 0) + assert info.host_data_ptr == 0 + assert info.device_data_ptr == 0x1000 + assert info.data_bytes == 16 + assert info.host_signal_ptr == 0 + assert info.device_signal_ptr == 0x2000 + assert info.signal_count == 2 + assert info.total_bytes == 256 + assert info.flags == 0 + # ============================================================================ # Python-level ChipWorker wrapper tests diff --git a/tests/ut/py/test_worker/test_mapped_region_hw.py b/tests/ut/py/test_worker/test_mapped_region_hw.py new file mode 100644 index 000000000..bcdb92c0d --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_hw.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Hardware smoke coverage for HostDeviceMappedRegion on a2a3 onboard.""" + +from __future__ import annotations + +import os + +import pytest + + +@pytest.mark.requires_hardware("a2a3") +@pytest.mark.platforms(["a2a3"]) +def test_a2a3_onboard_mapped_region_host_side_smoke(st_device_ids): + from simpler.worker import Worker + from simpler_setup.runtime_builder import RuntimeBuilder + + build = bool(os.environ.get("PTO_UT_BUILD")) + _ = RuntimeBuilder(platform="a2a3").get_binaries("tensormap_and_ringbuffer", build=build) + device_id = int(st_device_ids[0]) + + worker = Worker(level=2, platform="a2a3", runtime="tensormap_and_ringbuffer", device_id=device_id, build=build) + worker.init() + try: + region = worker.open_mapped_region(128, signal_count=2) + try: + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 128 + assert info.signal_count == 2 + + payload = bytes((i * 7) % 251 for i in range(96)) + worker.mapped_region_datacopy_h2region(region, 16, payload) + assert worker.mapped_region_datacopy_region2h(region, 16, len(payload)) == payload + + worker.mapped_region_wait(region, 0, 0, 0) + with pytest.raises(TimeoutError): + 
worker.mapped_region_wait(region, 0, 1, 0) + worker.mapped_region_notify(region, 0, 3) + worker.mapped_region_wait(region, 0, 3, 0) + finally: + worker.close_mapped_region(region) + finally: + worker.close() diff --git a/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py b/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py new file mode 100644 index 000000000..aed24a719 --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_round_trip_hw.py @@ -0,0 +1,56 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""CI gate for the mapped-region round-trip example.""" + +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + + +EXAMPLE = ( + Path(__file__).resolve().parents[4] + / "examples" + / "a2a3" + / "tensormap_and_ringbuffer" + / "host_device_mapped_region_round_trip" + / "main.py" +) +REPO_ROOT = Path(__file__).resolve().parents[4] + + +def _subprocess_env() -> dict[str, str]: + env = os.environ.copy() + paths = [str(REPO_ROOT), str(REPO_ROOT / "python")] + venv_lib = REPO_ROOT / ".venv" / "lib" + if venv_lib.exists(): + paths.extend(str(p) for p in sorted(venv_lib.glob("python*/site-packages"))) + existing = env.get("PYTHONPATH") + if existing: + paths.append(existing) + env["PYTHONPATH"] = os.pathsep.join(paths) + return env + + +@pytest.mark.requires_hardware("a2a3") +@pytest.mark.platforms(["a2a3"]) +def test_a2a3_onboard_mapped_region_real_npu_round_trip(st_device_ids): + result = subprocess.run( + [sys.executable, str(EXAMPLE), "-p", "a2a3", "-d", str(int(st_device_ids[0])), "--iters", "10"], + text=True, + capture_output=True, + timeout=180, + check=False, + env=_subprocess_env(), + ) + assert result.returncode == 0, result.stdout + result.stderr diff --git a/tests/ut/py/test_worker/test_mapped_region_sim.py b/tests/ut/py/test_worker/test_mapped_region_sim.py new file mode 100644 index 000000000..1bb17f808 --- /dev/null +++ b/tests/ut/py/test_worker/test_mapped_region_sim.py @@ -0,0 +1,225 @@ +import pytest + + +def _run_python_snippet(code: str) -> None: + import subprocess + import sys + import textwrap + + result = subprocess.run( + [sys.executable, "-c", textwrap.dedent(code)], + text=True, + capture_output=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + result.stdout + + +def test_worker_mailbox_control_offsets_match_cpp_contract(): + 
import simpler.worker as worker_mod + from _task_interface import ( # pyright: ignore[reportMissingImports] + CTRL_OFF_ARG0, + CTRL_OFF_ARG1, + CTRL_OFF_ARG2, + CTRL_OFF_ARG3, + CTRL_OFF_RESULT, + ) + + assert (worker_mod._CTRL_OFF_ARG0, worker_mod._CTRL_OFF_ARG1, worker_mod._CTRL_OFF_ARG2) == (16, 24, 32) + assert worker_mod._CTRL_OFF_ARG3 == 40 + assert worker_mod._CTRL_OFF_RESULT == 48 + assert (CTRL_OFF_ARG0, CTRL_OFF_ARG1, CTRL_OFF_ARG2, CTRL_OFF_ARG3, CTRL_OFF_RESULT) == ( + worker_mod._CTRL_OFF_ARG0, + worker_mod._CTRL_OFF_ARG1, + worker_mod._CTRL_OFF_ARG2, + worker_mod._CTRL_OFF_ARG3, + worker_mod._CTRL_OFF_RESULT, + ) + + +class FakeChipWorker: + def __init__(self): + self.calls = [] + + def open_mapped_region(self, data_bytes, signal_count=1, flags=0): + self.calls.append(("open", data_bytes, signal_count, flags)) + return 0xABC + + def mapped_region_info(self, handle): + self.calls.append(("info", handle)) + from simpler.task_interface import MappedRegionInfo + + return MappedRegionInfo(0, 0x1000, 16, 0, 0x2000, 2, 256, 0) + + def mapped_region_datacopy_h2region(self, handle, offset, data): + self.calls.append(("h2region", handle, offset, data)) + + def mapped_region_datacopy_region2h(self, handle, offset, nbytes): + self.calls.append(("region2h", handle, offset, nbytes)) + return b"out" + + def mapped_region_notify(self, handle, signal_id, value): + self.calls.append(("notify", handle, signal_id, value)) + + def mapped_region_wait(self, handle, signal_id, target, timeout_us): + self.calls.append(("wait", handle, signal_id, target, timeout_us)) + + def close_mapped_region(self, handle): + self.calls.append(("close", handle)) + + +def make_l2_worker_with_fake_chip(): + from simpler.worker import Worker + + worker = Worker(level=2) + worker._chip_worker = FakeChipWorker() + worker._initialized = True + return worker + + +def test_worker_l2_mapped_region_round_trips_to_chip_worker(): + worker = make_l2_worker_with_fake_chip() + + region = 
worker.open_mapped_region(16, signal_count=2) + assert region.handle == 0xABC + assert region.worker_id == 0 + assert region.data_bytes == 16 + assert region.signal_count == 2 + assert region.flags == 0 + assert region.closed is False + + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr == 0x1000 + + worker.mapped_region_datacopy_h2region(region, 4, b"abcd") + assert worker.mapped_region_datacopy_region2h(region, 0, 3) == b"out" + worker.mapped_region_notify(region, 1, 7) + worker.mapped_region_wait(region, 1, 7, 100) + worker.close_mapped_region(region) + + assert region.closed is True + assert worker._chip_worker.calls == [ + ("open", 16, 2, 0), + ("info", 0xABC), + ("h2region", 0xABC, 4, b"abcd"), + ("region2h", 0xABC, 0, 3), + ("notify", 0xABC, 1, 7), + ("wait", 0xABC, 1, 7, 100), + ("close", 0xABC), + ] + + +def test_worker_mapped_region_rejects_mismatched_worker_id_and_closed_wrapper(): + worker = make_l2_worker_with_fake_chip() + region = worker.open_mapped_region(16, signal_count=1) + + with pytest.raises(ValueError, match="worker_id"): + worker.mapped_region_info(region, worker_id=1) + + worker.close_mapped_region(region) + with pytest.raises(ValueError, match="closed"): + worker.mapped_region_notify(region, 0, 1) + + worker.close_mapped_region(region) + assert worker._chip_worker.calls[-1] == ("close", 0xABC) + assert worker._chip_worker.calls.count(("close", 0xABC)) == 1 + + +def test_worker_mapped_region_rejects_str_h2region_input(): + worker = make_l2_worker_with_fake_chip() + region = worker.open_mapped_region(16, signal_count=1) + + with pytest.raises(ValueError, match="bytes-like"): + worker.mapped_region_datacopy_h2region(region, 0, "text") + + +@pytest.mark.parametrize("platform", ["a2a3sim", "a5sim"]) +def test_worker_l2_mapped_region_sim_backend_round_trip(platform): + _run_python_snippet( + f""" + from simpler.worker import Worker + + worker = 
Worker(level=2, platform="{platform}", runtime="tensormap_and_ringbuffer", build=True) + worker.init() + try: + region = worker.open_mapped_region(8, signal_count=1) + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 8 + assert info.signal_count == 1 + + worker.mapped_region_datacopy_h2region(region, 2, b"abcd") + assert worker.mapped_region_datacopy_region2h(region, 0, 8) == b"\\x00\\x00abcd\\x00\\x00" + + worker.mapped_region_wait(region, 0, 0, 0) + try: + worker.mapped_region_wait(region, 0, 1, 0) + raise AssertionError("mapped_region_wait unexpectedly succeeded") + except TimeoutError: + pass + worker.mapped_region_notify(region, 0, 3) + worker.mapped_region_wait(region, 0, 3, 0) + + worker.close_mapped_region(region) + try: + worker.mapped_region_info(region) + raise AssertionError("closed mapped region unexpectedly succeeded") + except ValueError: + pass + finally: + worker.close() + """ + ) + + +@pytest.mark.parametrize("platform", ["a2a3sim", "a5sim"]) +def test_worker_l3_mapped_region_sim_backend_round_trip(platform): + _run_python_snippet( + f""" + from simpler.worker import Worker + + worker = Worker(level=3, device_ids=[0], platform="{platform}", runtime="tensormap_and_ringbuffer", build=True) + worker.init() + try: + region = worker.open_mapped_region(8192, signal_count=2, worker_id=0) + info = worker.mapped_region_info(region) + assert info.host_data_ptr == 0 + assert info.host_signal_ptr == 0 + assert info.device_data_ptr != 0 + assert info.device_signal_ptr != 0 + assert info.data_bytes == 8192 + assert info.signal_count == 2 + + payload = bytes((i % 251 for i in range(5000))) + worker.mapped_region_datacopy_h2region(region, 1024, payload) + assert worker.mapped_region_datacopy_region2h(region, 1024, len(payload)) == payload + + try: + worker.mapped_region_wait(region, 1, 1, 0) + raise 
AssertionError("mapped_region_wait unexpectedly succeeded") + except TimeoutError: + pass + worker.mapped_region_notify(region, 1, 9) + worker.mapped_region_wait(region, 1, 9, 0) + + try: + worker.mapped_region_info(region, worker_id=1) + raise AssertionError("mismatched worker_id unexpectedly succeeded") + except ValueError: + pass + worker.close_mapped_region(region) + try: + worker.mapped_region_info(region) + raise AssertionError("closed mapped region unexpectedly succeeded") + except ValueError: + pass + finally: + worker.close() + """ + )