-
Notifications
You must be signed in to change notification settings - Fork 77
[OpenMP][AMDGPU] Introduce device memory initialization #807
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3644,6 +3644,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
| // Load the HSA executable. | ||
| if (Error Err = AMDImage->loadExecutable(*this)) | ||
| return std::move(Err); | ||
|
|
||
| // Launch the special kernel for device memory initialization | ||
| if (Error Err = launchDMInitKernel(*AMDImage)) | ||
| return std::move(Err); | ||
|
|
||
| return AMDImage; | ||
| } | ||
|
|
||
|
|
@@ -4642,13 +4647,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
| Error preAllocateDeviceMemoryPool() { | ||
|
|
||
| void *DevPtr; | ||
| // Use PER_DEVICE_PREALLOC_SIZE (128KB) as heap and allocate 512MB for | ||
| // device memory | ||
| size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE + DMSlabSize; | ||
|
|
||
| for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { | ||
| if (!MemoryPool->isGlobal()) | ||
| continue; | ||
|
|
||
| if (MemoryPool->isCoarseGrained()) { | ||
| DevPtr = nullptr; | ||
| size_t PreAllocSize = hsa_utils::PER_DEVICE_PREALLOC_SIZE; | ||
|
|
||
| Error Err = MemoryPool->allocate(PreAllocSize, &DevPtr); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a little concerned about doing this in a single allocation. The slabs must be 2MB aligned. With the prealloc at the beginning, it seems unlikely that the slabs will end up being 2MB aligned. Putting it at the end might raise the possibility, or allocating it separately. We might want an assert or some other guard to ensure that DMSlabPtr is 2MB aligned.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the comment! Looking into this now |
||
| if (Err) | ||
|
|
@@ -4664,6 +4672,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
| "Zero initialization of preallocated device memory pool failed"); | ||
|
|
||
| PreAllocatedDeviceMemoryPool = DevPtr; | ||
|
|
||
| DMHeapPtr = DevPtr; | ||
| DMSlabPtr = | ||
| reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(DevPtr) + | ||
| hsa_utils::PER_DEVICE_PREALLOC_SIZE); | ||
| } | ||
| } | ||
| return Plugin::success(); | ||
|
|
@@ -5070,6 +5083,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
| /// True if in multi-device mode. | ||
| bool IsMultiDeviceEnabled = false; | ||
|
|
||
| /// Arguments for device memory initialization. | ||
| void *DMHeapPtr = nullptr; | ||
| void *DMSlabPtr = nullptr; | ||
| bool DMInitialized = false; | ||
| static constexpr uint32_t DMNumSlabs = 256; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems really large, especially if malloc is unused. HIP uses 4 here (8MB).
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review, Brian! For my learning, if slab runs out for malloc, will it cause substantial performance overhead (I guess there will be dynamic allocation)?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once the initial DMNumSlabs=256 slabs are exhausted, then the implementation will start asking for new slabs via hostcall/hostrpc and that will be slower. But on the other hand, every process running on the GPU taking away 1/2 GB of space each and sometimes using none of it seems wasteful, and could be problematic for applications that want to use most of the device memory for their own purposes and are avoiding using device malloc. The OpenMP team will have to decide on the best tradeoff of course, but I think 256 is pretty large.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks Brian, Kewen will try the smaller threshold later today, follow on PR
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, Brian! I will put up another PR following your suggestion to lower the slab size (to 8MB to see how it performs). |
||
| static constexpr size_t DMSlabSize = DMNumSlabs * (2 * 1024 * 1024); // 512MB | ||
|
|
||
| /// Struct holding time in ns at a point in time for both host and device | ||
| /// This is used to compute a device-to-host offset and skew. Required for | ||
| /// OMPT function translate_time. | ||
|
|
@@ -5167,6 +5187,70 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
| return It->second; | ||
| } | ||
|
|
||
| /// Launch the device memory initialization kernel. | ||
| Error launchDMInitKernel(AMDGPUDeviceImageTy &Image) { | ||
| // Already initialized, skip | ||
| if (DMInitialized) | ||
| return Plugin::success(); | ||
|
|
||
| if (!DMHeapPtr || !DMSlabPtr) | ||
| return Plugin::error( | ||
| ErrorCode::UNKNOWN, | ||
| "Device memory not allocated for launching DM init kernel."); | ||
|
|
||
| // Check if this image contains the DM init kernel | ||
| const char *KernelName = "__omp_dm_init_kernel"; | ||
|
|
||
| GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); | ||
| if (!Handler.isSymbolInImage(*this, Image, KernelName)) { | ||
| DP("DM init kernel is not in this image.\n"); | ||
| return Plugin::success(); | ||
| } | ||
|
|
||
| AMDGPUKernelTy DMInitKernel(KernelName, Plugin.getGlobalHandler()); | ||
| if (auto Err = DMInitKernel.init(*this, Image)) { | ||
| return Err; | ||
| } | ||
|
|
||
| DP("Device memory initializing...\n"); | ||
|
|
||
| // Prepare kernel arguments | ||
| struct __attribute__((packed)) { | ||
| uint64_t HeapAddr; | ||
| uint64_t SlabAddr; | ||
| } Args; | ||
|
|
||
| Args.HeapAddr = reinterpret_cast<uint64_t>(DMHeapPtr); | ||
| Args.SlabAddr = reinterpret_cast<uint64_t>(DMSlabPtr); | ||
|
|
||
| KernelArgsTy KernelArgs; | ||
| KernelLaunchParamsTy LaunchParams; | ||
| LaunchParams.Data = &Args; | ||
| LaunchParams.Size = sizeof(Args); | ||
|
|
||
| AsyncInfoWrapperTy AsyncInfo(*this, nullptr); | ||
|
|
||
| uint32_t NumThreads[3] = {256u, 1u, 1u}; | ||
| uint32_t NumBlocks[3] = {1u, 1u, 1u}; | ||
|
|
||
| // Launch kernel with 256 threads and 1 block | ||
| if (auto Err = DMInitKernel.launchImpl(*this, NumThreads, NumBlocks, | ||
| KernelArgs, LaunchParams, AsyncInfo)) | ||
| return Err; | ||
|
|
||
| // Wait for completion | ||
| Error Err = Plugin::success(); | ||
| AsyncInfo.finalize(Err); | ||
|
|
||
| // Mark as successfully initialized | ||
| if (!Err) { | ||
| DMInitialized = true; | ||
| DP("Device memory initialized successfully\n"); | ||
| } | ||
|
|
||
| return Err; | ||
| } | ||
|
|
||
| public: | ||
| /// Return if it is an MI300 series device. | ||
| bool checkIfMI300Device() { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| extern "C" { | ||
| void __ockl_dm_init_v1(unsigned long hp, unsigned long sp, unsigned int hb, | ||
| unsigned int nis); | ||
|
|
||
| /// Device memory initialization kernel | ||
| __attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(256, 256), | ||
| amdgpu_max_num_work_groups(1), visibility("protected"))) void | ||
| __omp_dm_init_kernel(unsigned long heap_ptr, unsigned long slab_ptr) { | ||
|
|
||
| unsigned int HEAP_BYTES = 1; | ||
| unsigned int NUM_SLABS = 256; | ||
|
|
||
| // Use 256 * 2MB = 512MB for GPU memory allocation. | ||
| __ockl_dm_init_v1(heap_ptr, slab_ptr, HEAP_BYTES, NUM_SLABS); | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why can't we simply
return Errhere?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we can directly return Err here. I think compiler will do implicit move for us. Using move just to match the coding style with existing code.