Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions src/CHIPBackend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2000,10 +2000,6 @@ std::shared_ptr<chipstar::Event>
chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem,
MANAGED_MEM_STATE ExecState) {

// TODO: Inspect kernel code for indirect allocation accesses. If
// the kernel does not have any, we only need inspect kernels
// pointer arguments for allocations to be synchronized.

auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker;
if (!AllocTracker->getNumHostAllocations() &&
!AllocTracker->getNumManagedAllocations())
Expand All @@ -2021,6 +2017,28 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem,
MemMap(&AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_READ_WRITE);
} else if (AllocInfo.HostPtr &&
AllocInfo.MemoryType == hipMemoryTypeManaged) {
// If the module has no indirect global buffer accesses, only sync
// allocations whose DevPtr is explicitly passed as a kernel argument.
const SPVModuleInfo &ModInfo = ExecItem->getKernel()->getModule()->getInfo();
if (ModInfo.HasNoIGBAs) {
bool IsKernelArg = false;
const auto &FuncInfo = *ExecItem->getKernel()->getFuncInfo();
FuncInfo.visitKernelArgs(ExecItem->getArgs(),
[&](const SPVFuncInfo::KernelArg &Arg) {
if (Arg.Kind == SPVTypeKind::Pointer &&
!Arg.isWorkgroupPtr()) {
void *PtrVal = *static_cast<void **>(
const_cast<void *>(Arg.Data));
if (PtrVal == AllocInfo.DevPtr)
IsKernelArg = true;
}
});
if (!IsKernelArg) {
logDebug("Skipping sync of managed memory {} - not a kernel arg",
AllocInfo.DevPtr);
return;
}
}
void *Src = PreKernel ? AllocInfo.HostPtr : AllocInfo.DevPtr;
void *Dst = PreKernel ? AllocInfo.DevPtr : AllocInfo.HostPtr;
logDebug("Sync managed memory {} -> {} ({})", Src, Dst,
Expand Down
7 changes: 7 additions & 0 deletions tests/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,10 @@ set_target_properties(manyMallocMemcopies PROPERTIES CXX_STANDARD_REQUIRED ON)
target_link_libraries(manyMallocMemcopies CHIP deviceInternal)
target_include_directories(manyMallocMemcopies
PRIVATE $<TARGET_PROPERTY:CHIP,INCLUDE_DIRECTORIES>)

set_source_files_properties(hostRegisterOverhead.hip PROPERTIES LANGUAGE CXX)
add_executable(hostRegisterOverhead hostRegisterOverhead.hip)
set_target_properties(hostRegisterOverhead PROPERTIES CXX_STANDARD_REQUIRED ON)
target_link_libraries(hostRegisterOverhead CHIP deviceInternal)
target_include_directories(hostRegisterOverhead
PRIVATE $<TARGET_PROPERTY:CHIP,INCLUDE_DIRECTORIES>)
86 changes: 86 additions & 0 deletions tests/benchmarks/hostRegisterOverhead.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include <chrono>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include "hip/hip_runtime.h"

#define CHECK(cmd) \
do { \
hipError_t e = (cmd); \
if (e != hipSuccess) { \
std::cerr << "HIP error " << hipGetErrorString(e) << " at " << __FILE__ \
<< ":" << __LINE__ << "\n"; \
exit(1); \
} \
} while (0)

static constexpr int N_KERNELS = 40000;
static constexpr size_t BUF_SIZE = 1024 * 1024 * 4; // 1 MB, enough for a realistic double buffer

#ifdef INDIRECT_KERNEL
__global__ void bench_kernel(double **ptr) { (*ptr)[0] += 1.0; }
#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0, ptr)
#define KERNEL_LABEL "indirect-ptr kernels"
#else
__global__ void bench_kernel() {}
#define KERNEL_LAUNCH(ptr) hipLaunchKernelGGL(bench_kernel, dim3(1), dim3(1), 0, 0)
#define KERNEL_LABEL "empty kernels "
#endif

int main() {
CHECK(hipSetDevice(0));

double *HostBuf = (double *)malloc(BUF_SIZE);
memset(HostBuf, 0, BUF_SIZE);

double *DevBuf;
double **DevPtrPtr;
CHECK(hipMalloc(&DevBuf, sizeof(double)));
CHECK(hipMalloc(&DevPtrPtr, sizeof(double *)));
CHECK(hipMemcpy(DevPtrPtr, &DevBuf, sizeof(double *), hipMemcpyHostToDevice));

// --- with hipHostRegister ---
CHECK(hipHostRegister(HostBuf, BUF_SIZE, 0));

CHECK(hipDeviceSynchronize());
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N_KERNELS; ++i) {
KERNEL_LAUNCH(DevPtrPtr);
CHECK(hipDeviceSynchronize());
}
auto t1 = std::chrono::high_resolution_clock::now();

CHECK(hipHostUnregister(HostBuf));

// --- without hipHostRegister ---
CHECK(hipDeviceSynchronize());
auto t2 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N_KERNELS; ++i) {
KERNEL_LAUNCH(DevPtrPtr);
CHECK(hipDeviceSynchronize());
}
auto t3 = std::chrono::high_resolution_clock::now();

free(HostBuf);
CHECK(hipFree(DevBuf));
CHECK(hipFree(DevPtrPtr));

double with_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
double without_ms = std::chrono::duration<double, std::milli>(t3 - t2).count();

std::cout << N_KERNELS << " " KERNEL_LABEL " WITH hipHostRegister: "
<< with_ms << " ms (" << with_ms / N_KERNELS << " ms/kernel)\n";
std::cout << N_KERNELS << " " KERNEL_LABEL " WITHOUT hipHostRegister: "
<< without_ms << " ms (" << without_ms / N_KERNELS << " ms/kernel)\n";
std::cout << "overhead factor: " << with_ms / without_ms << "x\n";

if( with_ms / without_ms > 5 ) {
printf( "Fail");
return 1;
}
else {
printf( "Pass\n");
return 0;
}

}
Loading