Skip to content

Commit d271144

Browse files
committed
GPU: Enforce some GPU-kernels are RTC-compiled with correct WARP_SIZE as launch-bounds
1 parent a79c4fc commit d271144

File tree

5 files changed

+22
-1
lines changed

5 files changed

+22
-1
lines changed

GPU/GPUTracking/Base/GPUReconstructionProcessing.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,10 @@ class GPUReconstructionProcessing : public GPUReconstruction
162162
// Interface to query name of a kernel
163163
template <class T, int32_t I>
164164
static const char* GetKernelName();
165-
const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; }
165+
// Returns a reference to the name stored at index i of mKernelNames. Being static, it can be called without an instance.
// NOTE(review): plain operator[] access, no range check is visible here - callers presumably keep i < GetNKernels(); confirm.
static const std::string& GetKernelName(int32_t i) { return mKernelNames[i]; }
166166
template <class T, int32_t I = 0>
167167
static uint32_t GetKernelNum();
168+
// Number of entries in mKernelNames, converted to uint32_t.
static uint32_t GetNKernels()
{
  return static_cast<uint32_t>(mKernelNames.size());
}
168169

169170
// Public queries for timers
170171
// Returns a reference to the entry of mTimersRecoSteps selected by getRecoStepNum(step) for the given reconstruction step.
auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
#include "GPUReconstructionCUDA.h"
1818
#include "GPUParamRTC.h"
1919
#include "GPUDefParametersLoad.inc"
20+
#include "GPUKernelsWith1Warp.inc"
2021
#include <unistd.h>
2122
#include "Framework/SHA1.h"
2223
#include <sys/stat.h>
2324
#include <fcntl.h>
2425
#include <filesystem>
26+
#include <algorithm>
2527

2628
#include <oneapi/tbb.h>
2729
using namespace o2::gpu;
@@ -81,6 +83,11 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
8183
GPUFatal("AMD_EUS_PER_CU not set in the parameters provided for the AMD GPU, you can override this via --PROChipOverrideAMDEUSperCU [n]");
8284
}
8385
}
86+
for (uint32_t i = 0; i < GetNKernels(); i++) {
87+
if (std::find(gpuKernelsWith1Warp.begin(), gpuKernelsWith1Warp.end(), GetKernelName(i)) != gpuKernelsWith1Warp.end()) {
88+
mParDevice->par_LB_maxThreads[i] = mWarpSize;
89+
}
90+
}
8491
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true, mParDevice->par_AMD_EUS_PER_CU ? (mParDevice->par_AMD_EUS_PER_CU * mWarpSize) : 0) +
8592
"#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n";
8693
if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) {

GPU/GPUTracking/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,11 @@ set(GPU_DEFAULT_PARAMS_HEADER_DEVICE ${ON_THE_FLY_DIR}/GPUDefParametersDefaultsD
299299
generate_gpu_param_header("${GPU_PARAM_JSON_FILES}" "ALL" "${GPU_DEFAULT_PARAMS_HEADER}" "${GPU_DEFAULT_PARAMS_HEADER_DEVICE}" GPU_CONST_PARAM_ARCHITECTUES) # generate header with default GPU parameters for all architectures
300300
list(APPEND GENERATED_HEADERS_LIST ${GPU_DEFAULT_PARAMS_HEADER} ${GPU_DEFAULT_PARAMS_HEADER_DEVICE})
301301

302+
# Generate GPUKernelsWith1Warp.inc: a C++ snippet defining o2::gpu::gpuKernelsWith1Warp, a
# std::vector<std::string> initialized from the O2_GPU_KERNELS_FORCE_1_WARP property of the
# O2_GPU_KERNELS target. The generator expressions are expanded at generate time, not when
# this command is parsed.
# NOTE(review): the generated snippet includes neither <string> nor <vector> itself; the
# including file presumably has both in scope already - confirm.
file(GENERATE
303+
OUTPUT "${ON_THE_FLY_DIR}/GPUKernelsWith1Warp.inc"
304+
CONTENT "namespace o2::gpu { static const std::vector<std::string> gpuKernelsWith1Warp = {\"$<JOIN:$<TARGET_PROPERTY:O2_GPU_KERNELS,O2_GPU_KERNELS_FORCE_1_WARP>,\"$<COMMA> \">\"}; }")
305+
# Only the newly generated file is appended: GPU_DEFAULT_PARAMS_HEADER is already part of
# GENERATED_HEADERS_LIST, and appending it again would duplicate it in HDRS_INSTALL.
list(APPEND GENERATED_HEADERS_LIST ${ON_THE_FLY_DIR}/GPUKernelsWith1Warp.inc)
306+
302307
set(HDRS_INSTALL ${HDRS_INSTALL} ${GENERATED_HEADERS_LIST})
303308
include(kernels.cmake)
304309

GPU/GPUTracking/cmake/kernel_helpers.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ define_property(TARGET PROPERTY O2_GPU_KERNEL_FILES)
2020
define_property(TARGET PROPERTY O2_GPU_KERNEL_NO_FAST_MATH)
2121
define_property(TARGET PROPERTY O2_GPU_KERNEL_PARAMS)
2222
define_property(TARGET PROPERTY O2_GPU_KERNEL_STRING_PARAMS)
23+
define_property(TARGET PROPERTY O2_GPU_KERNELS_FORCE_1_WARP)
2324
set(O2_GPU_KERNEL_WRAPPER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/GPU/include_gpu_onthefly")
2425
file(MAKE_DIRECTORY ${O2_GPU_KERNEL_WRAPPER_FOLDER})
2526
set(O2_GPU_BASE_DIR "${CMAKE_CURRENT_LIST_DIR}/../")
@@ -184,3 +185,7 @@ function(o2_gpu_kernel_add_string_parameter)
184185
set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_STRING_PARAMS "${ARGV${i}}")
185186
endforeach()
186187
endfunction()
188+
189+
# Append `kernel` to the O2_GPU_KERNELS_FORCE_1_WARP property of the O2_GPU_KERNELS target.
# Takes exactly one kernel name per call.
function(o2_gpu_kernel_requires_1_warp kernel)
190+
set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNELS_FORCE_1_WARP "${kernel}")
191+
endfunction()

GPU/GPUTracking/kernels.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,6 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP # Number of neighhbo
150150

151151
o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE # Data type to use for intermediate storage of dEdx truncated mean inputs
152152
MERGER_INTERPOLATION_ERROR_TYPE) # Data type for storing intermediate track residuals for interpolation
153+
154+
# Kernels added to the O2_GPU_KERNELS_FORCE_1_WARP list.
# NOTE(review): presumably these are the kernels whose launch bounds get pinned to one warp when RTC-compiled - confirm against the RTC code.
o2_gpu_kernel_requires_1_warp("GPUTPCCFDecodeZSLink")
155+
o2_gpu_kernel_requires_1_warp("GPUTPCCFDecodeZSDenseLink")

0 commit comments

Comments
 (0)