Device interfaces: CUDA 13 runtime API compatibility and OpenMP linking

What this patch does
  * interfaces/cuda/Control.cu, interfaces/hip/Control.cpp:
    getDeviceInfoAsText() no longer prints the `clockRate` and `computeMode`
    device properties.
  * interfaces/cuda/Copy.cu, interfaces/cuda/Memory.cu:
    cudaMemPrefetchAsync() and cudaMemAdvise() receive a `cudaMemLocation`
    (plus a flags argument for the prefetch) when CUDART_VERSION >= 13000 and
    the plain integer id otherwise.
  * sycl.cmake: the acpp/hipsycl branch links `device` against both the
    OpenMP::OpenMP_CXX target and ${OpenMP_CXX_FLAGS}, as PRIVATE.

Review notes
  * prefetchUnifiedMemTo(): the destination is chosen exactly like the removed
    ternary (`CurrentDevice` -> currentDeviceId, anything else -> host), so the
    call never receives a value-initialised location.
  * allocUnifiedMem(): cudaMemAdvise() is only issued when one of the two
    branches actually selected a location; when neither applies no advice is
    given, which is what the removed lines did.
  * The context line in Copy.cu reads `static_cast<cudaStream_t>(streamPtr)`.
  * NOTE(review): indentation and line breaks of context lines were
    reconstructed from the hunk line counts; if a hunk is rejected, apply with
    whitespace-insensitive matching (`git apply --ignore-whitespace` or
    `patch -l`) and confirm against the tree.
  * NOTE(review): the post-image blob ids on the `index` lines of Copy.cu and
    Memory.cu are carried over unchanged and are informational only.

diff --git a/interfaces/cuda/Control.cu b/interfaces/cuda/Control.cu
index b55b6f0..f154787 100644
--- a/interfaces/cuda/Control.cu
+++ b/interfaces/cuda/Control.cu
@@ -125,11 +125,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
diff --git a/interfaces/cuda/Copy.cu b/interfaces/cuda/Copy.cu
index 0b7de7f..8a4c30a 100644
--- a/interfaces/cuda/Copy.cu
+++ b/interfaces/cuda/Copy.cu
@@ -71,9 +71,30 @@ void ConcreteAPI::prefetchUnifiedMemTo(Destination type, const void *devPtr, siz
                                        void *streamPtr) {
   isFlagSet(status);
   cudaStream_t stream = (streamPtr == nullptr) ? 0 : (static_cast<cudaStream_t>(streamPtr));
+
+  // Prefetch destination; same selection as the former
+  // `type == CurrentDevice ? currentDeviceId : cudaCpuDeviceId` argument.
+  cudaMemLocation location{};
+  if (type == Destination::CurrentDevice) {
+    location.id = currentDeviceId;
+#if CUDART_VERSION >= 13000
+    location.type = cudaMemLocationTypeDevice;
+#endif
+  }
+  else {
+    location.id = cudaCpuDeviceId;
+#if CUDART_VERSION >= 13000
+    location.type = cudaMemLocationTypeHost;
+#endif
+  }
+
   cudaMemPrefetchAsync(devPtr,
                        count,
-                       type == Destination::CurrentDevice ? currentDeviceId : cudaCpuDeviceId,
+#if CUDART_VERSION >= 13000
+                       location, 0,
+#else
+                       location.id,
+#endif
                        stream);
   CHECK_ERR;
 }
diff --git a/interfaces/cuda/Memory.cu b/interfaces/cuda/Memory.cu
index 8ab88c5..eba255e 100644
--- a/interfaces/cuda/Memory.cu
+++ b/interfaces/cuda/Memory.cu
@@ -96,14 +96,38 @@ void *ConcreteAPI::allocUnifiedMem(size_t size, bool compress, Destination hint)
   void *devPtr;
   cudaMallocManaged(&devPtr, size, cudaMemAttachGlobal);
   CHECK_ERR;
+
+  // Preferred location of the new allocation. It stays unset when neither
+  // branch below applies; in that case no advice is issued at all, exactly
+  // as before this change.
+  cudaMemLocation location{};
+  bool hasPreferredLocation = false;
   if (hint == Destination::Host) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
-    CHECK_ERR;
+    location.id = cudaCpuDeviceId;
+#if CUDART_VERSION >= 13000
+    location.type = cudaMemLocationTypeHost;
+#endif
+    hasPreferredLocation = true;
   }
   else if (allowedConcurrentManagedAccess) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, currentDeviceId);
-    CHECK_ERR;
+    location.id = currentDeviceId;
+#if CUDART_VERSION >= 13000
+    location.type = cudaMemLocationTypeDevice;
+#endif
+    hasPreferredLocation = true;
   }
+
+  if (hasPreferredLocation) {
+    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation,
+#if CUDART_VERSION >= 13000
+                  location
+#else
+                  location.id
+#endif
+    );
+    CHECK_ERR;
+  }
+
   statistics.allocatedMemBytes += size;
   statistics.allocatedUnifiedMemBytes += size;
   memToSizeMap[devPtr] = size;
diff --git a/interfaces/hip/Control.cpp b/interfaces/hip/Control.cpp
index 0e4774e..b2721ac 100644
--- a/interfaces/hip/Control.cpp
+++ b/interfaces/hip/Control.cpp
@@ -134,11 +134,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
diff --git a/sycl.cmake b/sycl.cmake
index b16397e..3813fb4 100644
--- a/sycl.cmake
+++ b/sycl.cmake
@@ -51,7 +51,11 @@ if ((${DEVICE_BACKEND} STREQUAL "acpp") OR (${DEVICE_BACKEND} STREQUAL "hipsycl"
     find_package(AdaptiveCpp REQUIRED)
     find_package(OpenMP REQUIRED)
     target_compile_options(device PRIVATE -Wno-unknown-cuda-version)
-    target_link_libraries(device PUBLIC ${OpenMP_CXX_FLAGS})
+
+    # Link against both the OpenMP::OpenMP_CXX target and the raw OpenMP_CXX_FLAGS (otherwise some symbols won't be found).
+    target_link_libraries(device PRIVATE OpenMP::OpenMP_CXX)
+    target_link_libraries(device PRIVATE ${OpenMP_CXX_FLAGS})
+
     add_sycl_to_target(TARGET device SOURCES ${DEVICE_SOURCE_FILES})
 else()
     find_package(DpcppFlags REQUIRED)