diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index 03631ad06..d68426f68 100755 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -2000,10 +2000,6 @@ std::shared_ptr chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, MANAGED_MEM_STATE ExecState) { - // TODO: Inspect kernel code for indirect allocation accesses. If - // the kernel does not have any, we only need inspect kernels - // pointer arguments for allocations to be synchronized. - auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker; if (!AllocTracker->getNumHostAllocations() && !AllocTracker->getNumManagedAllocations()) @@ -2021,6 +2017,28 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, MemMap(&AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_READ_WRITE); } else if (AllocInfo.HostPtr && AllocInfo.MemoryType == hipMemoryTypeManaged) { + // If the module has no indirect global buffer accesses, only sync + // allocations whose DevPtr is explicitly passed as a kernel argument. + const SPVModuleInfo &ModInfo = ExecItem->getKernel()->getModule()->getInfo(); + if (ModInfo.HasNoIGBAs) { + bool IsKernelArg = false; + const auto &FuncInfo = *ExecItem->getKernel()->getFuncInfo(); + FuncInfo.visitKernelArgs(ExecItem->getArgs(), + [&](const SPVFuncInfo::KernelArg &Arg) { + if (Arg.Kind == SPVTypeKind::Pointer && + !Arg.isWorkgroupPtr()) { + void *PtrVal = *static_cast( + const_cast(Arg.Data)); + if (PtrVal == AllocInfo.DevPtr) + IsKernelArg = true; + } + }); + if (!IsKernelArg) { + logDebug("Skipping sync of managed memory {} - not a kernel arg", + AllocInfo.DevPtr); + return; + } + } void *Src = PreKernel ? AllocInfo.HostPtr : AllocInfo.DevPtr; void *Dst = PreKernel ? AllocInfo.DevPtr : AllocInfo.HostPtr; logDebug("Sync managed memory {} -> {} ({})", Src, Dst, diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt index 706ac2cb3..a36e018a3 100644 --- a/tests/benchmarks/CMakeLists.txt +++ b/tests/benchmarks/CMakeLists.txt @@ -18,3 +18,10 @@ set_target_properties(manyMallocMemcopies PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(manyMallocMemcopies CHIP deviceInternal) target_include_directories(manyMallocMemcopies PRIVATE $) + +set_source_files_properties(hostRegisterOverhead.hip PROPERTIES LANGUAGE CXX) +add_executable(hostRegisterOverhead hostRegisterOverhead.hip) +set_target_properties(hostRegisterOverhead PROPERTIES CXX_STANDARD_REQUIRED ON) +target_link_libraries(hostRegisterOverhead CHIP deviceInternal) +target_include_directories(hostRegisterOverhead + PRIVATE $) diff --git a/tests/benchmarks/hostRegisterOverhead.hip b/tests/benchmarks/hostRegisterOverhead.hip new file mode 100644 index 000000000..99822bc64 --- /dev/null +++ b/tests/benchmarks/hostRegisterOverhead.hip @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include "hip/hip_runtime.h" + +#define CHECK(cmd) \ + do { \ + hipError_t e = (cmd); \ + if (e != hipSuccess) { \ + std::cerr << "HIP error " << hipGetErrorString(e) << " at " << __FILE__ \ + << ":" << __LINE__ << "\n"; \ + exit(1); \ + } \ + } while (0) + +static constexpr int N_KERNELS = 40000; +static constexpr size_t BUF_SIZE = 1024 * 1024 * 4; // 1 MB, enough for a realistic double buffer + +#ifdef INDIRECT_KERNEL +__global__ void bench_kernel(double **ptr) { (*ptr)[0] += 1.0; } +#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0, ptr) +#define KERNEL_LABEL "indirect-ptr kernels" +#else +__global__ void bench_kernel() {} +#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0) +#define KERNEL_LABEL "empty kernels " +#endif + +int main() { + CHECK(hipSetDevice(0)); + + double *HostBuf = (double *)malloc(BUF_SIZE); + memset(HostBuf, 0, BUF_SIZE); + + double *DevBuf; + double **DevPtrPtr; + CHECK(hipMalloc(&DevBuf, sizeof(double))); + CHECK(hipMalloc(&DevPtrPtr, sizeof(double *))); + CHECK(hipMemcpy(DevPtrPtr, &DevBuf, sizeof(double *), hipMemcpyHostToDevice)); + + // --- with hipHostRegister --- + CHECK(hipHostRegister(HostBuf, BUF_SIZE, 0)); + + CHECK(hipDeviceSynchronize()); + auto t0 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < N_KERNELS; ++i) { + KERNEL_LAUNCH(DevPtrPtr); + CHECK(hipDeviceSynchronize()); + } + auto t1 = std::chrono::high_resolution_clock::now(); + + CHECK(hipHostUnregister(HostBuf)); + + // --- without hipHostRegister --- + CHECK(hipDeviceSynchronize()); + auto t2 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < N_KERNELS; ++i) { + KERNEL_LAUNCH(DevPtrPtr); + CHECK(hipDeviceSynchronize()); + } + auto t3 = std::chrono::high_resolution_clock::now(); + + free(HostBuf); + CHECK(hipFree(DevBuf)); + CHECK(hipFree(DevPtrPtr)); + + double with_ms = std::chrono::duration(t1 - t0).count(); + double without_ms = std::chrono::duration(t3 - t2).count(); + + std::cout << N_KERNELS << " " KERNEL_LABEL " WITH hipHostRegister: " + << with_ms << " ms (" << with_ms / N_KERNELS << " ms/kernel)\n"; + std::cout << N_KERNELS << " " KERNEL_LABEL " WITHOUT hipHostRegister: " + << without_ms << " ms (" << without_ms / N_KERNELS << " ms/kernel)\n"; + std::cout << "overhead factor: " << with_ms / without_ms << "x\n"; + + if( with_ms / without_ms > 5 ) { + printf( "Fail"); + return 1; + } + else { + printf( "Pass\n"); + return 0; + } + +}