From 2ea79f2b545368c4ee35d6e2e430a9d6ab71dea8 Mon Sep 17 00:00:00 2001
From: Kewen Meng
Date: Tue, 9 Dec 2025 22:23:03 -0600
Subject: [PATCH 1/2] [OpenMP][AMDGPU] Introduce memory initialization

---
Reviewer notes (this area is not part of the commit message):

* The patch text had lost its line breaks and the template arguments of
  every reinterpret_cast. Line breaks were rebuilt from the +/- markers and
  the hunk line counts. Indentation and the position of blank context lines
  (notably around preAllocateDeviceMemoryPool) are a best guess - TODO
  confirm against the tree before applying.
* Cast arguments were restored as uint64_t, void * and uintptr_t, inferred
  from the types being assigned to - TODO confirm.
* The author e-mail address is missing from the "From:" headers.
* launchDMInitKernel now looks for the kernel symbol before requiring the
  preallocated heap/slab pointers, so an image without the kernel still
  loads on a device that has no preallocation.
* launchDMInitKernel now passes the launch error through
  AsyncInfo.finalize() instead of returning before finalize() is called.
  NOTE(review): presumably AsyncInfoWrapperTy requires finalize() before
  destruction - verify against PluginInterface.h.
* Commit ids and the blob ids on the "index" lines predate these edits.
* Open questions: the kernel passes HEAP_BYTES = 1 although the host
  comment describes a 128KB heap, and the slab count 256 is spelled out
  independently on the host and device side.

 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 89 +++++++++++++++++++++-
 openmp/device/CMakeLists.txt               |  1 +
 openmp/device/src/DeviceMemInit.cpp        | 16 ++++
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 openmp/device/src/DeviceMemInit.cpp

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 020815980ec3a..1bd6504b123d7 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3644,6 +3644,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))
       return std::move(Err);
+
+    // Launch the special kernel for device memory initialization
+    if (Error Err = launchDMInitKernel(*AMDImage))
+      return std::move(Err);
+
     return AMDImage;
   }
 
@@ -4642,13 +4647,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error preAllocateDeviceMemoryPool() {
 
     void *DevPtr;
+    // Use PER_DEVICE_PREALLOC_SIZE (128KB) as heap and allocate 512MB for
+    // device memory
+    size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE + DMSlabSize;
+
     for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
       if (!MemoryPool->isGlobal())
         continue;
 
       if (MemoryPool->isCoarseGrained()) {
         DevPtr = nullptr;
-        size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE;
 
         Error Err = MemoryPool->allocate(PreAllocSize, &DevPtr);
         if (Err)
@@ -4664,6 +4672,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
               "Zero initialization of preallocated device memory pool failed");
 
         PreAllocatedDeviceMemoryPool = DevPtr;
+
+        DMHeapPtr = DevPtr;
+        DMSlabPtr =
+            reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(DevPtr) +
+                                     hsa_utils::PER_DEVICE_PREALLOC_SIZE);
       }
     }
     return Plugin::success();
@@ -5070,6 +5083,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True if in multi-device mode.
   bool IsMultiDeviceEnabled = false;
 
+  /// Arguments for device memory initialization.
+  void *DMHeapPtr = nullptr;
+  void *DMSlabPtr = nullptr;
+  bool DMInitialized = false;
+  static constexpr uint32_t DMNumSlabs = 256;
+  static constexpr size_t DMSlabSize = DMNumSlabs * (2 * 1024 * 1024); // 512MB
+
   /// Struct holding time in ns at a point in time for both host and device
   /// This is used to compute a device-to-host offset and skew. Required for
   /// OMPT function translate_time.
@@ -5167,6 +5187,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return It->second;
   }
 
+  /// Launch the device memory initialization kernel.
+  Error launchDMInitKernel(AMDGPUDeviceImageTy &Image) {
+    // Already initialized, skip
+    if (DMInitialized)
+      return Plugin::success();
+
+    // Check if this image contains the DM init kernel
+    const char *KernelName = "__omp_dm_init_kernel";
+
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    if (!Handler.isSymbolInImage(*this, Image, KernelName)) {
+      DP("DM init kernel is not in this image.\n");
+      return Plugin::success();
+    }
+
+    // Only an image that carries the kernel needs the preallocated memory.
+    if (!DMHeapPtr || !DMSlabPtr)
+      return Plugin::error(
+          ErrorCode::UNKNOWN,
+          "Device memory not allocated for launching DM init kernel.");
+
+    AMDGPUKernelTy DMInitKernel(KernelName, Plugin.getGlobalHandler());
+    if (auto Err = DMInitKernel.init(*this, Image))
+      return Err;
+
+    DP("Device memory initializing...\n");
+
+    // Prepare kernel arguments
+    uint64_t HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr);
+    uint64_t SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr);
+
+    struct __attribute__((packed)) {
+      uint64_t HeapAddr;
+      uint64_t SlabAddr;
+    } Args;
+
+    Args.HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr);
+    Args.SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr);
+
+    KernelArgsTy KernelArgs;
+    KernelLaunchParamsTy LaunchParams;
+    LaunchParams.Data = &Args;
+    LaunchParams.Size = sizeof(Args);
+
+    AsyncInfoWrapperTy AsyncInfo(*this, nullptr);
+
+    uint32_t NumThreads[3] = {256u, 1u, 1u};
+    uint32_t NumBlocks[3] = {1u, 1u, 1u};
+
+    // Launch kernel with 256 threads and 1 block
+    Error Err = DMInitKernel.launchImpl(*this, NumThreads, NumBlocks,
+                                        KernelArgs, LaunchParams, AsyncInfo);
+
+    // Finalize on every path, including a failed launch, so the wrapper is
+    // never left unfinalized. With a successful launch this also waits for
+    // completion.
+    AsyncInfo.finalize(Err);
+
+    // Mark as successfully initialized
+    if (!Err) {
+      DMInitialized = true;
+      DP("Device memory initialized successfully\n");
+    }
+
+    return Err;
+  }
+
 public:
   /// Return if it is an MI300 series device.
   bool checkIfMI300Device() {
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index b88c8e6074a2e..5722cea1bf9e2 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -36,6 +36,7 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteamr.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Memory.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteams.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceMemInit.cpp
   ${emissary_sources}
 )
 
diff --git a/openmp/device/src/DeviceMemInit.cpp b/openmp/device/src/DeviceMemInit.cpp
new file mode 100644
index 0000000000000..03a26a698d5f9
--- /dev/null
+++ b/openmp/device/src/DeviceMemInit.cpp
@@ -0,0 +1,16 @@
+extern "C" {
+void __ockl_dm_init_v1(unsigned long hp, unsigned long sp, unsigned int hb,
+                       unsigned int nis);
+
+/// Device memory initialization kernel
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(256, 256),
+               amdgpu_max_num_work_groups(1), visibility("protected"))) void
+__omp_dm_init_kernel(unsigned long heap_ptr, unsigned long slab_ptr) {
+
+  unsigned int HEAP_BYTES = 1;
+  unsigned int NUM_SLABS = 256;
+
+  // Use 256 * 2MB = 512MB for GPU memory allocation.
+  __ockl_dm_init_v1(heap_ptr, slab_ptr, HEAP_BYTES, NUM_SLABS);
+}
+}

From ac1c6c3277fce4bfff6bc3e20a37db51e6686bd8 Mon Sep 17 00:00:00 2001
From: Kewen Meng
Date: Tue, 9 Dec 2025 23:25:05 -0600
Subject: [PATCH 2/2] cleanup

---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 1bd6504b123d7..b9b4fdfc8231f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -5215,9 +5215,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     DP("Device memory initializing...\n");
 
     // Prepare kernel arguments
-    uint64_t HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr);
-    uint64_t SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr);
-
     struct __attribute__((packed)) {
       uint64_t HeapAddr;
       uint64_t SlabAddr;