ROCm · ronlieb · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025 · jplehr
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3644,6 +3644,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))
       return std::move(Err);
+
+    // Launch the special kernel for device memory initialization
+    if (Error Err = launchDMInitKernel(*AMDImage))
+      return std::move(Err);
+
     return AMDImage;
   }
 
@@ -4642,13 +4647,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error preAllocateDeviceMemoryPool() {
 
     void *DevPtr;
+    // Use PER_DEVICE_PREALLOC_SIZE (128KB) as heap and allocate 512MB for
+    // device memory
+    size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE + DMSlabSize;
+
     for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
       if (!MemoryPool->isGlobal())
         continue;
 
       if (MemoryPool->isCoarseGrained()) {
         DevPtr = nullptr;
-        size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE;
 
         Error Err = MemoryPool->allocate(PreAllocSize, &DevPtr);
         if (Err)
@@ -4664,6 +4672,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
               "Zero initialization of preallocated device memory pool failed");
 
         PreAllocatedDeviceMemoryPool = DevPtr;
+
+        DMHeapPtr = DevPtr;
+        DMSlabPtr =
+            reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(DevPtr) +
+                                     hsa_utils::PER_DEVICE_PREALLOC_SIZE);
       }
     }
     return Plugin::success();
@@ -5070,6 +5083,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True if in multi-device mode.
   bool IsMultiDeviceEnabled = false;
 
+  /// Arguments for device memory initialization.
+  void *DMHeapPtr = nullptr;
+  void *DMSlabPtr = nullptr;
+  bool DMInitialized = false;
+  static constexpr uint32_t DMNumSlabs = 256;
+  static constexpr size_t DMSlabSize = DMNumSlabs * (2 * 1024 * 1024); // 512MB
+
   /// Struct holding time in ns at a point in time for both host and device
   /// This is used to compute a device-to-host offset and skew. Required for
   /// OMPT function translate_time.
@@ -5167,6 +5187,70 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return It->second;
   }
 
+  /// Launch the device memory initialization kernel.
+  Error launchDMInitKernel(AMDGPUDeviceImageTy &Image) {
+    // Already initialized, skip
+    if (DMInitialized)
+      return Plugin::success();
+
+    if (!DMHeapPtr || !DMSlabPtr)
+      return Plugin::error(
+          ErrorCode::UNKNOWN,
+          "Device memory not allocated for launching DM init kernel.");
+
+    // Check if this image contains the DM init kernel
+    const char *KernelName = "__omp_dm_init_kernel";
+
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    if (!Handler.isSymbolInImage(*this, Image, KernelName)) {
+      DP("DM init kernel is not in this image.\n");
+      return Plugin::success();
+    }
+
+    AMDGPUKernelTy DMInitKernel(KernelName, Plugin.getGlobalHandler());
+    if (auto Err = DMInitKernel.init(*this, Image)) {
+      return Err;
+    }
+
+    DP("Device memory initializing...\n");
+
+    // Prepare kernel arguments
+    struct __attribute__((packed)) {
+      uint64_t HeapAddr;
+      uint64_t SlabAddr;
+    } Args;
+
+    Args.HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr);
+    Args.SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr);
+
+    KernelArgsTy KernelArgs;
+    KernelLaunchParamsTy LaunchParams;
+    LaunchParams.Data = &Args;
+    LaunchParams.Size = sizeof(Args);
+
+    AsyncInfoWrapperTy AsyncInfo(*this, nullptr);
+
+    uint32_t NumThreads[3] = {256u, 1u, 1u};
+    uint32_t NumBlocks[3] = {1u, 1u, 1u};
+
+    // Launch kernel with 256 threads and 1 block
+    if (auto Err = DMInitKernel.launchImpl(*this, NumThreads, NumBlocks,
+                                           KernelArgs, LaunchParams, AsyncInfo))
+      return Err;
+
+    // Wait for completion
+    Error Err = Plugin::success();
+    AsyncInfo.finalize(Err);
+
+    // Mark as successfully initialized
+    if (!Err) {
+      DMInitialized = true;
+      DP("Device memory initialized successfully\n");
+    }
+
+    return Err;
+  }
+
 public:
   /// Return if it is an MI300 series device.
   bool checkIfMI300Device() {

diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
@@ -36,6 +36,7 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteamr.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Memory.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteams.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceMemInit.cpp
   ${emissary_sources}
 
 )

diff --git a/openmp/device/src/DeviceMemInit.cpp b/openmp/device/src/DeviceMemInit.cpp
@@ -0,0 +1,16 @@
+extern "C" {
+void __ockl_dm_init_v1(unsigned long hp, unsigned long sp, unsigned int hb,
+                       unsigned int nis);
+
+/// Device memory initialization kernel
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(256, 256),
+               amdgpu_max_num_work_groups(1), visibility("protected"))) void
+__omp_dm_init_kernel(unsigned long heap_ptr, unsigned long slab_ptr) {
+
+  unsigned int HEAP_BYTES = 1;
+  unsigned int NUM_SLABS = 256;
+
+  // Use 256 * 2MB = 512MB for GPU memory allocation.
+  __ockl_dm_init_v1(heap_ptr, slab_ptr, HEAP_BYTES, NUM_SLABS);
+}
+}