[OpenMP][AMDGPU] Introduce device memory initialization (#807)

Kewen12 · web-flow · commit 7ea86bbee916 · 2025-12-10T13:49:01.000-05:00
This PR fixes the issue in https://ontrack-internal.amd.com/browse/SWDEV-566712. It launches `__ockl_dm_init_v1` in a special kernel before any user kernels in order to initialize device memory. In this implementation, the kernel is launched right after the image is loaded, so the image load/unload logic does not have to be repeated. This special kernel is built into the device image (so it will probably be duplicated when there are multiple images). Ideally, this kernel would live in a separate image (or be embedded as a binary blob in the host runtime) so that it is more self-contained; I will look into that as future work. All smoke tests passed.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3644,6 +3644,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))
       return std::move(Err);
+
+    // Launch the special kernel for device memory initialization
+    if (Error Err = launchDMInitKernel(*AMDImage))
+      return std::move(Err);
+
     return AMDImage;
   }
 
@@ -4642,13 +4647,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error preAllocateDeviceMemoryPool() {
 
     void *DevPtr;
+    // Use PER_DEVICE_PREALLOC_SIZE (128KB) as heap and allocate 512MB for
+    // device memory
+    size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE + DMSlabSize;
+
     for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
       if (!MemoryPool->isGlobal())
         continue;
 
       if (MemoryPool->isCoarseGrained()) {
         DevPtr = nullptr;
-        size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE;
 
         Error Err = MemoryPool->allocate(PreAllocSize, &DevPtr);
         if (Err)
@@ -4664,6 +4672,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
               "Zero initialization of preallocated device memory pool failed");
 
         PreAllocatedDeviceMemoryPool = DevPtr;
+
+        DMHeapPtr = DevPtr;
+        DMSlabPtr =
+            reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(DevPtr) +
+                                     hsa_utils::PER_DEVICE_PREALLOC_SIZE);
       }
     }
     return Plugin::success();
@@ -5070,6 +5083,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True if in multi-device mode.
   bool IsMultiDeviceEnabled = false;
 
+  /// Arguments for device memory initialization.
+  void *DMHeapPtr = nullptr;
+  void *DMSlabPtr = nullptr;
+  bool DMInitialized = false;
+  static constexpr uint32_t DMNumSlabs = 256;
+  static constexpr size_t DMSlabSize = DMNumSlabs * (2 * 1024 * 1024); // 512MB
+
   /// Struct holding time in ns at a point in time for both host and device
   /// This is used to compute a device-to-host offset and skew. Required for
   /// OMPT function translate_time.
@@ -5167,6 +5187,70 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return It->second;
   }
 
+  /// Launch the device memory initialization kernel (`__omp_dm_init_kernel`)
+  /// from \p Image, at most once per device, and wait for it to complete.
+  /// Returns success without launching if initialization already happened or
+  /// the image lacks the kernel; returns an error if the heap/slab pointers are
+  /// unset or if kernel init, launch, or completion fails.
+  Error launchDMInitKernel(AMDGPUDeviceImageTy &Image) {
+    // Already initialized, skip
+    if (DMInitialized)
+      return Plugin::success();
+
+    if (!DMHeapPtr || !DMSlabPtr)
+      return Plugin::error(
+          ErrorCode::UNKNOWN,
+          "Device memory not allocated for launching DM init kernel.");
+
+    // Check if this image contains the DM init kernel
+    const char *KernelName = "__omp_dm_init_kernel";
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    if (!Handler.isSymbolInImage(*this, Image, KernelName)) {
+      DP("DM init kernel is not in this image.\n");
+      return Plugin::success();
+    }
+
+    AMDGPUKernelTy DMInitKernel(KernelName, Handler);
+    if (auto Err = DMInitKernel.init(*this, Image))
+      return Err;
+
+    DP("Device memory initializing...\n");
+
+    // Prepare kernel arguments: the heap and slab addresses as 64-bit values
+    struct __attribute__((packed)) {
+      uint64_t HeapAddr;
+      uint64_t SlabAddr;
+    } Args;
+    Args.HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr);
+    Args.SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr);
+
+    KernelArgsTy KernelArgs;
+    KernelLaunchParamsTy LaunchParams;
+    LaunchParams.Data = &Args;
+    LaunchParams.Size = sizeof(Args);
+
+    AsyncInfoWrapperTy AsyncInfo(*this, nullptr);
+    uint32_t NumThreads[3] = {256u, 1u, 1u};
+    uint32_t NumBlocks[3] = {1u, 1u, 1u};
+
+    // Launch kernel with 256 threads and 1 block. Do not return early on a
+    // launch failure: the error is passed to finalize() so the wrapper is
+    // finalized on every path. NOTE(review): upstream AsyncInfoWrapperTy
+    // asserts in its destructor when finalize() was skipped - confirm here.
+    Error Err = DMInitKernel.launchImpl(*this, NumThreads, NumBlocks,
+                                        KernelArgs, LaunchParams, AsyncInfo);
+    // Wait for completion
+    AsyncInfo.finalize(Err);
+
+    // Mark as successfully initialized
+    if (!Err) {
+      DMInitialized = true;
+      DP("Device memory initialized successfully\n");
+    }
+
+    return Err;
+  }
+
 public:
   /// Return if it is an MI300 series device.
   bool checkIfMI300Device() {
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
@@ -36,6 +36,7 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteamr.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Memory.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Xteams.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceMemInit.cpp
   ${emissary_sources}
 
 )
diff --git a/openmp/device/src/DeviceMemInit.cpp b/openmp/device/src/DeviceMemInit.cpp
@@ -0,0 +1,16 @@
+extern "C" {
+void __ockl_dm_init_v1(unsigned long hp, unsigned long sp, unsigned int hb,
+                       unsigned int nis);
+
+/// Kernel entry point that hands the heap and slab addresses to
+/// __ockl_dm_init_v1 along with a fixed heap-bytes value and slab count.
+__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(256, 256),
+               amdgpu_max_num_work_groups(1), visibility("protected"))) void
+__omp_dm_init_kernel(unsigned long heap_ptr, unsigned long slab_ptr) {
+  // 256 slabs * 2MB = 512MB for GPU memory allocation.
+  const unsigned int SlabCount = 256;
+  const unsigned int HeapBytes = 1;
+
+  __ockl_dm_init_v1(heap_ptr, slab_ptr, HeapBytes, SlabCount);
+}
+}

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,7 @@ set(src_files`
`36`	`36`	`${CMAKE_CURRENT_SOURCE_DIR}/src/Xteamr.cpp`
`37`	`37`	`${CMAKE_CURRENT_SOURCE_DIR}/src/Memory.cpp`
`38`	`38`	`${CMAKE_CURRENT_SOURCE_DIR}/src/Xteams.cpp`
	`39`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceMemInit.cpp`
`39`	`40`	`${emissary_sources}`
`40`	`41`
`41`	`42`	`)`