From d5e11a204e5c69da3ea4cd237895f5c74b6b0a38 Mon Sep 17 00:00:00 2001 From: Colleen Bertoni Date: Fri, 1 May 2026 18:48:46 -0500 Subject: [PATCH 1/3] implementing TODO in RegisterVarCopy --- src/CHIPBackend.cc | 25 ++++++-- tests/benchmarks/CMakeLists.txt | 7 +++ tests/benchmarks/hostRegisterOverhead.hip | 71 +++++++++++++++++++++++ 3 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 tests/benchmarks/hostRegisterOverhead.hip diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index 03631ad06..4d9bebdc1 100755 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -2000,10 +2000,6 @@ std::shared_ptr chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, MANAGED_MEM_STATE ExecState) { - // TODO: Inspect kernel code for indirect allocation accesses. If - // the kernel does not have any, we only need inspect kernels - // pointer arguments for allocations to be synchronized. - auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker; if (!AllocTracker->getNumHostAllocations() && !AllocTracker->getNumManagedAllocations()) @@ -2021,6 +2017,27 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, MemMap(&AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_READ_WRITE); } else if (AllocInfo.HostPtr && AllocInfo.MemoryType == hipMemoryTypeManaged) { + // If the module has no indirect global buffer accesses, only sync + // allocations whose DevPtr is explicitly passed as a kernel argument. + const SPVModuleInfo &ModInfo = ExecItem->getKernel()->getModule()->getInfo(); + if (ModInfo.HasNoIGBAs) { + bool IsKernelArg = false; + const auto &FuncInfo = *ExecItem->getKernel()->getFuncInfo(); + FuncInfo.visitKernelArgs(ExecItem->getArgs(), + [&](const SPVFuncInfo::KernelArg &Arg) { + if (Arg.Kind == SPVTypeKind::Pointer && + !Arg.isWorkgroupPtr()) { + void *PtrVal = *static_cast( + const_cast(Arg.Data)); + if (PtrVal == AllocInfo.DevPtr) + IsKernelArg = true; + } + }); + if (!IsKernelArg) { + logDebug("Skipping sync of managed memory {} - not a kernel arg", + AllocInfo.DevPtr); + return; + } void *Src = PreKernel ? AllocInfo.HostPtr : AllocInfo.DevPtr; void *Dst = PreKernel ? AllocInfo.DevPtr : AllocInfo.HostPtr; logDebug("Sync managed memory {} -> {} ({})", Src, Dst, diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt index 706ac2cb3..a36e018a3 100644 --- a/tests/benchmarks/CMakeLists.txt +++ b/tests/benchmarks/CMakeLists.txt @@ -18,3 +18,10 @@ set_target_properties(manyMallocMemcopies PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(manyMallocMemcopies CHIP deviceInternal) target_include_directories(manyMallocMemcopies PRIVATE $) + +set_source_files_properties(hostRegisterOverhead.hip PROPERTIES LANGUAGE CXX) +add_executable(hostRegisterOverhead hostRegisterOverhead.hip) +set_target_properties(hostRegisterOverhead PROPERTIES CXX_STANDARD_REQUIRED ON) +target_link_libraries(hostRegisterOverhead CHIP deviceInternal) +target_include_directories(hostRegisterOverhead + PRIVATE $) diff --git a/tests/benchmarks/hostRegisterOverhead.hip b/tests/benchmarks/hostRegisterOverhead.hip new file mode 100644 index 000000000..899a1993e --- /dev/null +++ b/tests/benchmarks/hostRegisterOverhead.hip @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include "hip/hip_runtime.h" + +#define CHECK(cmd) \ + do { \ + hipError_t e = (cmd); \ + if (e != hipSuccess) { \ + std::cerr << "HIP error " << hipGetErrorString(e) << " at " << __FILE__ \ + << ":" << __LINE__ << "\n"; \ + exit(1); \ + } \ + } while (0) + +static constexpr int N_KERNELS = 100; +static constexpr size_t BUF_SIZE = 1024 * 1024; // 1 MB, enough for a realistic double buffer + +__global__ void empty_kernel() {} + +int main() { + CHECK(hipSetDevice(0)); + + double *HostBuf = (double *)malloc(BUF_SIZE); + memset(HostBuf, 0, BUF_SIZE); + + // --- with hipHostRegister --- + CHECK(hipHostRegister(HostBuf, BUF_SIZE, 0)); + + CHECK(hipDeviceSynchronize()); + auto t0 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < N_KERNELS; ++i) { + hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, 0); + } + CHECK(hipDeviceSynchronize()); + auto t1 = std::chrono::high_resolution_clock::now(); + + CHECK(hipHostUnregister(HostBuf)); + + // --- without hipHostRegister --- + CHECK(hipDeviceSynchronize()); + auto t2 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < N_KERNELS; ++i) { + hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, 0); + } + CHECK(hipDeviceSynchronize()); + auto t3 = std::chrono::high_resolution_clock::now(); + + free(HostBuf); + + double with_ms = std::chrono::duration(t1 - t0).count(); + double without_ms = std::chrono::duration(t3 - t2).count(); + + std::cout << N_KERNELS << " empty kernels WITH hipHostRegister: " + << with_ms << " ms (" << with_ms / N_KERNELS << " ms/kernel)\n"; + std::cout << N_KERNELS << " empty kernels WITHOUT hipHostRegister: " + << without_ms << " ms (" << without_ms / N_KERNELS << " ms/kernel)\n"; + std::cout << "overhead factor: " << with_ms / without_ms << "x\n"; + + + if( with_ms / without_ms > 10 ) { + printf( "Fail"); + return 1; + } + else { + printf( "Pass\n"); + return 0; + } + +} From 42dc6676c71c16ddc233aedf903d06241b1abb06 Mon Sep 17 00:00:00 2001 From: Colleen Bertoni Date: Mon, 11 May 2026 15:19:01 -0500 Subject: [PATCH 2/3] fixing typo and extending test --- src/CHIPBackend.cc | 1 + tests/benchmarks/hostRegisterOverhead.hip | 43 +++++++++++++++-------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index 4d9bebdc1..d68426f68 100755 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -2038,6 +2038,7 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, AllocInfo.DevPtr); return; } + } void *Src = PreKernel ? AllocInfo.HostPtr : AllocInfo.DevPtr; void *Dst = PreKernel ? AllocInfo.DevPtr : AllocInfo.HostPtr; logDebug("Sync managed memory {} -> {} ({})", Src, Dst, diff --git a/tests/benchmarks/hostRegisterOverhead.hip b/tests/benchmarks/hostRegisterOverhead.hip index 899a1993e..dc85616f9 100644 --- a/tests/benchmarks/hostRegisterOverhead.hip +++ b/tests/benchmarks/hostRegisterOverhead.hip @@ -14,10 +14,18 @@ } \ } while (0) -static constexpr int N_KERNELS = 100; -static constexpr size_t BUF_SIZE = 1024 * 1024; // 1 MB, enough for a realistic double buffer +static constexpr int N_KERNELS = 40000; +static constexpr size_t BUF_SIZE = 1024 * 1024 * 4; // 1 MB, enough for a realistic double buffer -__global__ void empty_kernel() {} +#ifdef INDIRECT_KERNEL +__global__ void bench_kernel(double **ptr) { (*ptr)[0] += 1.0; } +#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0, ptr) +#define KERNEL_LABEL "indirect-ptr kernels" +#else +__global__ void bench_kernel() {} +#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0) +#define KERNEL_LABEL "empty kernels " +#endif int main() { CHECK(hipSetDevice(0)); @@ -25,15 +33,21 @@ int main() { double *HostBuf = (double *)malloc(BUF_SIZE); memset(HostBuf, 0, BUF_SIZE); + double *DevBuf; + double **DevPtrPtr; + CHECK(hipMalloc(&DevBuf, sizeof(double))); + CHECK(hipMalloc(&DevPtrPtr, sizeof(double *))); + CHECK(hipMemcpy(DevPtrPtr, &DevBuf, sizeof(double *), hipMemcpyHostToDevice)); + // --- with hipHostRegister --- CHECK(hipHostRegister(HostBuf, BUF_SIZE, 0)); CHECK(hipDeviceSynchronize()); auto t0 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < N_KERNELS; ++i) { - hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, 0); + KERNEL_LAUNCH(DevPtrPtr); + CHECK(hipDeviceSynchronize()); } - CHECK(hipDeviceSynchronize()); auto t1 = std::chrono::high_resolution_clock::now(); CHECK(hipHostUnregister(HostBuf)); @@ -42,30 +56,31 @@ int main() { CHECK(hipDeviceSynchronize()); auto t2 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < N_KERNELS; ++i) { - hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, 0); + KERNEL_LAUNCH(DevPtrPtr); + CHECK(hipDeviceSynchronize()); } - CHECK(hipDeviceSynchronize()); auto t3 = std::chrono::high_resolution_clock::now(); free(HostBuf); + CHECK(hipFree(DevBuf)); + CHECK(hipFree(DevPtrPtr)); double with_ms = std::chrono::duration(t1 - t0).count(); double without_ms = std::chrono::duration(t3 - t2).count(); - std::cout << N_KERNELS << " empty kernels WITH hipHostRegister: " + std::cout << N_KERNELS << " " KERNEL_LABEL " WITH hipHostRegister: " << with_ms << " ms (" << with_ms / N_KERNELS << " ms/kernel)\n"; - std::cout << N_KERNELS << " empty kernels WITHOUT hipHostRegister: " + std::cout << N_KERNELS << " " KERNEL_LABEL " WITHOUT hipHostRegister: " << without_ms << " ms (" << without_ms / N_KERNELS << " ms/kernel)\n"; std::cout << "overhead factor: " << with_ms / without_ms << "x\n"; - - if( with_ms / without_ms > 10 ) { + if( with_ms / without_ms > 10 ) { printf( "Fail"); return 1; - } - else { + } + else { printf( "Pass\n"); return 0; - } + } } From b25a77f8fdb119ff60e38a9ca563d0b13b58d3d3 Mon Sep 17 00:00:00 2001 From: Colleen Bertoni Date: Mon, 11 May 2026 15:32:19 -0500 Subject: [PATCH 3/3] fixing typo and extending test --- tests/benchmarks/hostRegisterOverhead.hip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmarks/hostRegisterOverhead.hip b/tests/benchmarks/hostRegisterOverhead.hip index dc85616f9..99822bc64 100644 --- a/tests/benchmarks/hostRegisterOverhead.hip +++ b/tests/benchmarks/hostRegisterOverhead.hip @@ -74,7 +74,7 @@ int main() { << without_ms << " ms (" << without_ms / N_KERNELS << " ms/kernel)\n"; std::cout << "overhead factor: " << with_ms / without_ms << "x\n"; - if( with_ms / without_ms > 10 ) { + if( with_ms / without_ms > 5 ) { printf( "Fail"); return 1; }