diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9fd1045..f1ee0e2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,6 +23,14 @@ jobs:
           - Release
           - Debug
         setup:
+          - arch: none
+            backend: none
+            cc: gcc-13
+            cxx: g++-13
+            fc: gfortran-13
+            container: seissol/gha-cpu:davschneller-gpu-image
+            runner: ubuntu-24.04
+            pythonbreak: true
           - arch: sm_60
             backend: cuda
             cc: gcc-13
@@ -98,7 +106,7 @@ jobs:
           cd ../..
 
       - name: checkout-device
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           submodules: recursive
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97cab89..b2e7e1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,19 +13,18 @@ if (NOT DEFINED DEVICE_BACKEND)
   message(FATAL_ERROR "DEVICE_BACKEND variable has not been provided into the submodule")
 else()
   set(FOUND OFF)
-  foreach(VARIANT cuda hip oneapi hipsycl acpp)
+  foreach(VARIANT cuda hip oneapi hipsycl acpp none)
     if (${DEVICE_BACKEND} STREQUAL ${VARIANT})
       set(FOUND ON)
     endif()
   endforeach()
   if (NOT FOUND)
-    message(FATAL_ERROR "DEVICE_BACKEND must be either cuda, hip, opeapi, acpp, or hipsycl. Given: ${DEVICE_BACKEND}")
+    message(FATAL_ERROR "DEVICE_BACKEND must be either none, cuda, hip, oneapi, acpp, or hipsycl. Given: ${DEVICE_BACKEND}")
   endif()
 endif()
 
-if (NOT DEFINED DEVICE_ARCH)
-  message(FATAL_ERROR "DEVICE_ARCH is not defined. "
-          "Supported for example: sm_60, sm_61, sm_70, sm_71, gfx906, gfx908, dg1, bdw, skl, Gen8, Gen9, Gen11, Gen12LP")
+if ((NOT (${DEVICE_BACKEND} STREQUAL "none")) AND (NOT DEFINED DEVICE_ARCH))
+  message(FATAL_ERROR "DEVICE_ARCH has not been defined")
 endif()
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
@@ -44,10 +43,10 @@ elseif(${DEVICE_BACKEND} STREQUAL "hip")
 elseif((${DEVICE_BACKEND} STREQUAL "oneapi") OR (${DEVICE_BACKEND} STREQUAL "hipsycl") OR (${DEVICE_BACKEND} STREQUAL "acpp"))
   set(BACKEND_FOLDER "sycl")
   include(sycl.cmake)
+else()
+  add_library(device OBJECT device.cpp algorithms/Dummy.cpp)
 endif()
 
-string(TOUPPER ${BACKEND_FOLDER} BACKEND_UPPER_CASE)
-
 # common options
 target_compile_features(device PRIVATE cxx_std_17)
@@ -58,15 +57,24 @@ if (ENABLE_PROFILING_MARKERS)
   target_compile_definitions(device PRIVATE PROFILING_ENABLED)
 endif()
 
-target_compile_definitions(device PRIVATE DEVICE_LANG_${BACKEND_UPPER_CASE})
+if (NOT (${DEVICE_BACKEND} STREQUAL "none"))
+  string(TOUPPER ${BACKEND_FOLDER} BACKEND_UPPER_CASE)
+  target_compile_definitions(device PRIVATE DEVICE_LANG_${BACKEND_UPPER_CASE})
+
+  target_include_directories(device PRIVATE
+    interfaces/${BACKEND_FOLDER}
+    interfaces/common
+    algorithms/${BACKEND_FOLDER})
+endif()
 
 if (LOG_LEVEL_MASTER)
   target_compile_definitions(device PRIVATE LOG_LEVEL=${LOG_LEVEL_MASTER})
 endif()
 
-target_include_directories(device PRIVATE .
-                           interfaces/${BACKEND_FOLDER}
-                           interfaces/common
-                           algorithms/${BACKEND_FOLDER}
-                           submodules)
+if (DEFINED DEVICE_SUBMODULES)
+  target_include_directories(device PRIVATE ${DEVICE_SUBMODULES})
+else()
+  target_include_directories(device PRIVATE submodules)
+endif()
+target_include_directories(device PRIVATE .)
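With the new `none` backend, the wrapper builds as a stub: `libdevice` still links, but no `DEVICE_LANG_*` macro is defined and no `ConcreteAPI` is compiled in (see the `device.cpp` changes below). A minimal sketch of how a consumer might probe for this at runtime; the `hasDevice` helper is hypothetical and not part of this patch:

```cpp
#include <iostream>
#include <stdexcept>

#include "device.h"

// Hypothetical probe: with DEVICE_BACKEND=none, DeviceInstance::api()
// throws std::runtime_error because no ConcreteAPI was ever constructed.
bool hasDevice() {
  try {
    device::DeviceInstance::instance().api();
    return true;
  } catch (const std::runtime_error &) {
    return false;
  }
}

int main() {
  std::cout << (hasDevice() ? "device backend available" : "stub (none) build") << '\n';
  return 0;
}
```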
diff --git a/UsmAllocator.h b/UsmAllocator.h
index 89d7f6e..5e9b305 100644
--- a/UsmAllocator.h
+++ b/UsmAllocator.h
@@ -22,7 +22,7 @@ class UsmAllocator {
   using difference_type = std::ptrdiff_t;
 
   UsmAllocator() noexcept = delete;
-  UsmAllocator(device::DeviceInstance& instance) noexcept : api(instance.api) {}
+  UsmAllocator(device::DeviceInstance& instance) noexcept : api(&instance.api()) {}
   UsmAllocator(const UsmAllocator &) noexcept = default;
   UsmAllocator(UsmAllocator &&) noexcept = default;
diff --git a/algorithms/Dummy.cpp b/algorithms/Dummy.cpp
new file mode 100644
index 0000000..aece8b2
--- /dev/null
+++ b/algorithms/Dummy.cpp
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: 2020-2024 SeisSol Group
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <cstddef>
+#include <cstring>
+#include <string>
+#include "Algorithms.h"
+
+namespace device {
+template <typename T>
+void Algorithms::scaleArray(T *devArray, T scalar, const size_t numElements, void* streamPtr) {
+}
+template void Algorithms::scaleArray(float *devArray, float scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(double *devArray, double scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(int *devArray, int scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(unsigned *devArray, unsigned scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(char *devArray, char scalar, const size_t numElements, void* streamPtr);
+
+template <typename T>
+void Algorithms::fillArray(T *devArray, const T scalar, const size_t numElements, void* streamPtr) {
+}
+template void Algorithms::fillArray(float *devArray, float scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(double *devArray, double scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(int *devArray, int scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(unsigned *devArray, unsigned scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(char *devArray, char scalar, const size_t numElements, void* streamPtr);
+
+void Algorithms::touchMemoryI(void *ptr, size_t size, bool clean, void* streamPtr) {
+}
+
+void Algorithms::incrementalAddI(void** out,
+                                 void *base,
+                                 size_t increment,
+                                 size_t numElements,
+                                 void* streamPtr) {
+}
+
+void Algorithms::streamBatchedDataI(const void **baseSrcPtr,
+                                    void **baseDstPtr,
+                                    size_t elementSize,
+                                    size_t numElements,
+                                    void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::accumulateBatchedData(const T **baseSrcPtr,
+                                       T **baseDstPtr,
+                                       size_t elementSize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+template void Algorithms::accumulateBatchedData(const float **baseSrcPtr,
+                                                float **baseDstPtr,
+                                                size_t elementSize,
+                                                size_t numElements,
+                                                void* streamPtr);
+
+template void Algorithms::accumulateBatchedData(const double **baseSrcPtr,
+                                                double **baseDstPtr,
+                                                size_t elementSize,
+                                                size_t numElements,
+                                                void* streamPtr);
+
+void Algorithms::touchBatchedMemoryI(void **basePtr,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     bool clean,
+                                     void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::setToValue(T** out, T value, size_t elementSize, size_t numElements, void* streamPtr) {
+}
+
+template void Algorithms::setToValue(float** out, float value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(double** out, double value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
+
+void Algorithms::copyUniformToScatterI(const void *src,
+                                       void **dst,
+                                       size_t srcOffset,
+                                       size_t copySize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+void Algorithms::copyScatterToUniformI(const void **src,
+                                       void *dst,
+                                       size_t dstOffset,
+                                       size_t copySize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::compareDataWithHost(const T *hostPtr, const T *devPtr, const size_t numElements,
+                                     const std::string &dataName) {
+}
+
+template void Algorithms::compareDataWithHost(const float *hostPtr, const float *devPtr, const size_t numElements,
+                                              const std::string &dataName);
+template void Algorithms::compareDataWithHost(const double *hostPtr, const double *devPtr, const size_t numElements,
+                                              const std::string &dataName);
+
+template <typename AccT, typename VecT>
+void Algorithms::reduceVector(AccT* result, const VecT *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr) {
+}
+
+template void Algorithms::reduceVector(int* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long* result, const long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long* result, const unsigned long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const long long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned long long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(float* result, const float *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(double* result, const float *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(double* result, const double *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+
+} // namespace device
diff --git a/device.cpp b/device.cpp
index 0438a24..f9d89d3 100644
--- a/device.cpp
+++ b/device.cpp
@@ -3,15 +3,17 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "device.h"
+#include "Algorithms.h"
 
 #ifdef DEVICE_LANG_CUDA
 #include "interfaces/cuda/CudaWrappedAPI.h"
+#define DEVICE_ENABLED
 #elif DEVICE_LANG_HIP
 #include "interfaces/hip/HipWrappedAPI.h"
+#define DEVICE_ENABLED
 #elif DEVICE_LANG_SYCL
 #include "interfaces/sycl/SyclWrappedAPI.h"
-#else
-#error "Unknown interface for the device wrapper"
+#define DEVICE_ENABLED
 #endif
 
 using namespace device;
@@ -20,17 +22,41 @@ DeviceInstance::DeviceInstance() {
   // NOTE: all headers inside of macros define their unique ConcreteInterface.
   // Make sure to not include multiple different interfaces at the same time.
   // Only one interface is allowed per program because of issues of unique compilers, etc.
-  api = new ConcreteAPI;
-  algorithms.setDeviceApi(api);
+#ifdef DEVICE_ENABLED
+  apiP = std::make_unique<ConcreteAPI>();
+  algorithmsP = std::make_unique<Algorithms>();
+
+  algorithmsP->setDeviceApi(apiP.get());
+#endif
 }
 
 DeviceInstance::~DeviceInstance() {
+#ifdef DEVICE_ENABLED
   this->finalize();
-  delete api;
-  api = nullptr;
+#endif
 }
 
 void DeviceInstance::finalize() {
-  api->finalize();
+#ifdef DEVICE_ENABLED
+  api().finalize();
+#endif
 }
+
+DeviceInstance& DeviceInstance::instance() {
+  static DeviceInstance currentInstance;
+  return currentInstance;
+}
+
+AbstractAPI& DeviceInstance::api() {
+  if (apiP == nullptr) {
+    throw std::runtime_error("Device API was requested, but it is not initialized.");
+  }
+  return *apiP;
+}
+
+Algorithms& DeviceInstance::algorithms() {
+  if (algorithmsP == nullptr) {
+    throw std::runtime_error("Device algorithms were requested, but they are not initialized.");
+  }
+  return *algorithmsP;
+}
diff --git a/device.h b/device.h
index 67f6d53..a1aeee5 100644
--- a/device.h
+++ b/device.h
@@ -7,6 +7,8 @@
 
 #include "AbstractAPI.h"
 #include "Algorithms.h"
+#include <memory>
+#include <stdexcept>
 
 namespace device {
 
@@ -15,17 +17,17 @@ class DeviceInstance {
 public:
   DeviceInstance(const DeviceInstance &) = delete;
   DeviceInstance &operator=(const DeviceInstance &) = delete;
-  static DeviceInstance &getInstance() {
-    static DeviceInstance instance;
-    return instance;
-  }
+  static DeviceInstance& instance();
   ~DeviceInstance();
   void finalize();
 
-  AbstractAPI *api{nullptr};
-  Algorithms algorithms{};
+  AbstractAPI& api();
+
+  Algorithms& algorithms();
 
 private:
+  std::unique_ptr<AbstractAPI> apiP{nullptr};
+  std::unique_ptr<Algorithms> algorithmsP{nullptr};
   DeviceInstance();
 };
 } // namespace device
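Taken together, the `device.h`/`device.cpp` changes replace the public `AbstractAPI *api` member with accessor functions that return references and fail loudly on a stub build. Call sites migrate mechanically; a sketch of the before/after pattern, using only names from this patch:

```cpp
#include "device.h"

void example() {
  // Before: raw pointer member on the singleton.
  //   DeviceInstance &device = DeviceInstance::getInstance();
  //   device.api->initialize();

  // After: renamed accessor returning a reference; throws std::runtime_error
  // when the wrapper was built with DEVICE_BACKEND=none.
  device::DeviceInstance &device = device::DeviceInstance::instance();
  device.api().initialize();

  // Call sites that cached the pointer can keep arrow syntax:
  auto *api = &device.api();
  api->syncDevice();
}
```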
devices available: " << numDevices << '\n'; if (numDevices > 0) { - device.api->setDevice(0); + device.api().setDevice(0); } - device.api->initialize(); + device.api().initialize(); // print some device info - std::string deviceInfo(device.api->getDeviceInfoAsText(0)); + std::string deviceInfo(device.api().getDeviceInfoAsText(0)); std::cout << deviceInfo << std::endl; - std::cout << "alignment: " << device.api->getGlobMemAlignment() << std::endl; - std::cout << "max available mem: " << device.api->getMaxAvailableMem() << std::endl; - std::cout << "max shared mem: " << device.api->getMaxSharedMemSize() << std::endl; - std::cout << "max thread block size: " << device.api->getMaxThreadBlockSize() << std::endl; + std::cout << "alignment: " << device.api().getGlobMemAlignment() << std::endl; + std::cout << "max available mem: " << device.api().getMaxAvailableMem() << std::endl; + std::cout << "max shared mem: " << device.api().getMaxSharedMemSize() << std::endl; + std::cout << "max thread block size: " << device.api().getMaxThreadBlockSize() << std::endl; // allocate mem. on a device - real *dInputArray = static_cast(device.api->allocGlobMem(sizeof(real) * size)); - real *dOutputArray = static_cast(device.api->allocGlobMem(sizeof(real) * size)); + real *dInputArray = static_cast(device.api().allocGlobMem(sizeof(real) * size)); + real *dOutputArray = static_cast(device.api().allocGlobMem(sizeof(real) * size)); // copy data into a device - device.api->copyTo(dInputArray, inputArray, sizeof(real) * size); + device.api().copyTo(dInputArray, inputArray, sizeof(real) * size); // call a kernel - device.api->checkOffloading(); - device.api->syncDevice(); + device.api().checkOffloading(); + device.api().syncDevice(); // copy data from a device - device.api->copyFrom(outputArray, dOutputArray, sizeof(real) * size); + device.api().copyFrom(outputArray, dOutputArray, sizeof(real) * size); // deallocate mem. 
on a device - device.api->freeGlobMem(dInputArray); - device.api->freeGlobMem(dOutputArray); + device.api().freeGlobMem(dInputArray); + device.api().freeGlobMem(dOutputArray); - std::cout << device.api->getMemLeaksReport(); + std::cout << device.api().getMemLeaksReport(); device.finalize(); diff --git a/examples/jacobi/src/gpu/solver.cpp b/examples/jacobi/src/gpu/solver.cpp index b348cd5..4ebf56c 100644 --- a/examples/jacobi/src/gpu/solver.cpp +++ b/examples/jacobi/src/gpu/solver.cpp @@ -39,22 +39,22 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat std::tie(invDiag, lu) = getDLU(matrix); // allocate gpu data structures - DeviceInstance &device = DeviceInstance::getInstance(); - device.api->setDevice(ws.rank); - device.api->initialize(); + DeviceInstance &device = DeviceInstance::instance(); + device.api().setDevice(ws.rank); + device.api().initialize(); this->setUp(lu, rhs, x, residual, invDiag); // start solver Statistics computeStat(ws, range); Statistics commStat(ws, range); - auto defaultStream = device.api->getDefaultStream(); + auto defaultStream = device.api().getDefaultStream(); while ((infNorm > settings.eps) and (currentIter <= settings.maxNumIters)) { computeStat.start(); launch_multMatVec(*devLU, devX, devTemp, defaultStream); launch_manipVectors(range, devRhs, devTemp, devX, VectorManipOps::Subtraction, defaultStream); launch_manipVectors(range, devInvDiag, devX, devX, VectorManipOps::Multiply, defaultStream); - device.api->syncDevice(); + device.api().syncDevice(); computeStat.stop(); commStat.start(); @@ -69,7 +69,7 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat launch_multMatVec(*devLU, devX, devResidual, defaultStream); launch_manipVectors(range, devTemp, devResidual, devResidual, VectorManipOps::Addition, defaultStream); launch_manipVectors(range, devRhs, devResidual, devResidual, VectorManipOps::Subtraction, defaultStream); - device.api->copyFrom(const_cast(residual.data()), devResidual, residual.size() * sizeof(real)); + device.api().copyFrom(const_cast(residual.data()), devResidual, residual.size() * sizeof(real)); auto localInfNorm = infNorm = host::getInfNorm(range, residual); #ifdef USE_MPI @@ -90,31 +90,31 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat ++currentIter; } - device.api->syncDevice(); + device.api().syncDevice(); assembler.assemble(devX, devTempX); - device.api->copyFrom(const_cast(x.data()), devTempX, x.size() * sizeof(real)); + device.api().copyFrom(const_cast(x.data()), devTempX, x.size() * sizeof(real)); this->tearDown(); } void gpu::Solver::setUp(const CpuMatrixDataT &lu, const VectorT &rhs, const VectorT &x, const VectorT &residual, VectorT &invDiag) { - DeviceInstance &device = DeviceInstance::getInstance(); + DeviceInstance &device = DeviceInstance::instance(); - devRhs = static_cast(device.api->allocGlobMem(rhs.size() * sizeof(real))); - device.api->copyTo(devRhs, rhs.data(), rhs.size() * sizeof(real)); + devRhs = static_cast(device.api().allocGlobMem(rhs.size() * sizeof(real))); + device.api().copyTo(devRhs, rhs.data(), rhs.size() * sizeof(real)); - devX = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); - device.api->copyTo(devX, x.data(), x.size() * sizeof(real)); + devX = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); + device.api().copyTo(devX, x.data(), x.size() * sizeof(real)); - devTempX = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); - device.api->copyTo(devTempX, x.data(), 
x.size() * sizeof(real)); + devTempX = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); + device.api().copyTo(devTempX, x.data(), x.size() * sizeof(real)); - devTemp = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); + devTemp = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); // InvDiag still holds the diagonal elements at this point - devDiag = static_cast(device.api->allocGlobMem(invDiag.size() * sizeof(real))); - device.api->copyTo(devDiag, invDiag.data(), invDiag.size() * sizeof(real)); + devDiag = static_cast(device.api().allocGlobMem(invDiag.size() * sizeof(real))); + device.api().copyTo(devDiag, invDiag.data(), invDiag.size() * sizeof(real)); // compute inverse diagonal matrix std::transform(invDiag.begin(), invDiag.end(), invDiag.begin(), [](const real &diag) { @@ -122,33 +122,33 @@ void gpu::Solver::setUp(const CpuMatrixDataT &lu, const VectorT &rhs, const Vect return 1.0 / diag; }); - devInvDiag = static_cast(device.api->allocGlobMem(invDiag.size() * sizeof(real))); - device.api->copyTo(devInvDiag, invDiag.data(), invDiag.size() * sizeof(real)); + devInvDiag = static_cast(device.api().allocGlobMem(invDiag.size() * sizeof(real))); + device.api().copyTo(devInvDiag, invDiag.data(), invDiag.size() * sizeof(real)); - devResidual = static_cast(device.api->allocGlobMem(residual.size() * sizeof(real))); - device.api->copyTo(devResidual, residual.data(), residual.size() * sizeof(real)); + devResidual = static_cast(device.api().allocGlobMem(residual.size() * sizeof(real))); + device.api().copyTo(devResidual, residual.data(), residual.size() * sizeof(real)); devLU = std::make_unique(lu.info); // GpuMatrixDataT devLU(LU.Info); - devLU->data = static_cast(device.api->allocGlobMem(lu.info.volume * sizeof(real))); - device.api->copyTo(devLU->data, lu.data.data(), lu.info.volume * sizeof(real)); + devLU->data = static_cast(device.api().allocGlobMem(lu.info.volume * sizeof(real))); + device.api().copyTo(devLU->data, lu.data.data(), lu.info.volume * sizeof(real)); - devLU->indices = static_cast(device.api->allocGlobMem(lu.info.volume * sizeof(int))); - device.api->copyTo(devLU->indices, lu.indices.data(), lu.info.volume * sizeof(int)); + devLU->indices = static_cast(device.api().allocGlobMem(lu.info.volume * sizeof(int))); + device.api().copyTo(devLU->indices, lu.indices.data(), lu.info.volume * sizeof(int)); } void gpu::Solver::tearDown() { - DeviceInstance &device = DeviceInstance::getInstance(); - device.api->freeGlobMem(devResidual); - device.api->freeGlobMem(devInvDiag); - device.api->freeGlobMem(devDiag); - device.api->freeGlobMem(devTemp); - device.api->freeGlobMem(devX); - device.api->freeGlobMem(devTempX); - device.api->freeGlobMem(devRhs); - - device.api->freeGlobMem(devLU->data); - device.api->freeGlobMem(devLU->indices); + DeviceInstance &device = DeviceInstance::instance(); + device.api().freeGlobMem(devResidual); + device.api().freeGlobMem(devInvDiag); + device.api().freeGlobMem(devDiag); + device.api().freeGlobMem(devTemp); + device.api().freeGlobMem(devX); + device.api().freeGlobMem(devTempX); + device.api().freeGlobMem(devRhs); + + device.api().freeGlobMem(devLU->data); + device.api().freeGlobMem(devLU->indices); devLU.reset(nullptr); } diff --git a/examples/jacobi/src/helper.hpp b/examples/jacobi/src/helper.hpp index 977eb3d..9c1bb75 100644 --- a/examples/jacobi/src/helper.hpp +++ b/examples/jacobi/src/helper.hpp @@ -96,8 +96,8 @@ class VectorAssembler { std::memcpy(reinterpret_cast(dest), reinterpret_cast(src), recvCounts[0] * 
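The Jacobi example's `setUp`/`tearDown` pair shows how repetitive the manual `allocGlobMem`/`copyTo`/`freeGlobMem` sequences remain under the new accessor. A hedged sketch of an RAII wrapper that would collapse those pairs; `DeviceBuffer` is hypothetical and not part of this patch, and it assumes the `allocGlobMem`/`copyTo`/`freeGlobMem` signatures used above:

```cpp
#include <cstddef>
#include "device.h"

// Hypothetical RAII helper: owns one device allocation, so setUp/tearDown
// pairs like the ones above cannot leak on an early return.
template <typename T>
class DeviceBuffer {
public:
  explicit DeviceBuffer(std::size_t count)
      : size(count * sizeof(T)),
        ptr(static_cast<T *>(device::DeviceInstance::instance().api().allocGlobMem(size))) {}
  ~DeviceBuffer() { device::DeviceInstance::instance().api().freeGlobMem(ptr); }
  DeviceBuffer(const DeviceBuffer &) = delete;
  DeviceBuffer &operator=(const DeviceBuffer &) = delete;

  // Host-to-device copy of the full buffer.
  void copyFromHost(const T *host) {
    device::DeviceInstance::instance().api().copyTo(ptr, host, size);
  }
  T *get() { return ptr; }

private:
  std::size_t size;
  T *ptr;
};
```

With such a wrapper, `tearDown` would become unnecessary: the buffers release themselves when the solver is destroyed.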
diff --git a/examples/jacobi/src/helper.hpp b/examples/jacobi/src/helper.hpp
index 977eb3d..9c1bb75 100644
--- a/examples/jacobi/src/helper.hpp
+++ b/examples/jacobi/src/helper.hpp
@@ -96,8 +96,8 @@ class VectorAssembler {
       std::memcpy(reinterpret_cast<void *>(dest), reinterpret_cast<const void *>(src), recvCounts[0] * sizeof(real));
     }
   } else {
-    device::DeviceInstance &device = device::DeviceInstance::getInstance();
-    device.api->copyBetween(dest, src, recvCounts[0] * sizeof(real));
+    device::DeviceInstance &device = device::DeviceInstance::instance();
+    device.api().copyBetween(dest, src, recvCounts[0] * sizeof(real));
   }
 #endif
 }
diff --git a/examples/jacobi/tests/driver.cpp b/examples/jacobi/tests/driver.cpp
index c00fbfc..7445592 100644
--- a/examples/jacobi/tests/driver.cpp
+++ b/examples/jacobi/tests/driver.cpp
@@ -24,9 +24,9 @@ int main(int argc, char **argv) {
   WorkSpaceT ws{MPI_COMM_WORLD};
 #endif
 
-  DeviceInstance &device = DeviceInstance::getInstance();
-  device.api->setDevice(ws.rank);
-  device.api->initialize();
+  DeviceInstance &device = DeviceInstance::instance();
+  device.api().setDevice(ws.rank);
+  device.api().initialize();
 
   ::testing::InitGoogleTest(&argc, argv);
   int result = RUN_ALL_TESTS();
diff --git a/examples/jacobi/tests/gpu/subroutines_tests.cpp b/examples/jacobi/tests/gpu/subroutines_tests.cpp
index 61fe904..e920207 100644
--- a/examples/jacobi/tests/gpu/subroutines_tests.cpp
+++ b/examples/jacobi/tests/gpu/subroutines_tests.cpp
@@ -12,7 +12,7 @@ using ::testing::ElementsAreArray;
 
 TEST(Subroutines, MultMatrixVec) {
-  auto *api = ::device::DeviceInstance::getInstance().api;
+  auto *api = &::device::DeviceInstance::instance().api();
   auto defaultStream = api->getDefaultStream();
 
   const int size = 3;
@@ -54,7 +54,7 @@ TEST(Subroutines, MultMatrixVec) {
 }
 
 TEST(Subroutines, VectorManips) {
-  auto *api = ::device::DeviceInstance::getInstance().api;
+  auto *api = &::device::DeviceInstance::instance().api();
   auto defaultStream = api->getDefaultStream();
 
   const int size = 3;
diff --git a/examples/mpi/src/main.cpp b/examples/mpi/src/main.cpp
index b65c100..1e70aa2 100644
--- a/examples/mpi/src/main.cpp
+++ b/examples/mpi/src/main.cpp
@@ -10,7 +10,7 @@ using namespace device;
 
 void forkOther(int otherRank) {
-  DeviceInstance &device = DeviceInstance::getInstance();
-  auto *api = device.api;
+  DeviceInstance &device = DeviceInstance::instance();
+  auto *api = &device.api();
 
   api->setDevice(otherRank);
@@ -24,7 +24,7 @@ void forkOther(int otherRank) {
 }
 
 void forkRoot(int rootRank) {
-  DeviceInstance &device = DeviceInstance::getInstance();
-  auto *api = device.api;
+  DeviceInstance &device = DeviceInstance::instance();
+  auto *api = &device.api();
 
   api->setDevice(rootRank);
diff --git a/interfaces/cuda/Control.cu b/interfaces/cuda/Control.cu
index b55b6f0..f154787 100644
--- a/interfaces/cuda/Control.cu
+++ b/interfaces/cuda/Control.cu
@@ -125,11 +125,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
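The `clockRate` and `computeMode` lines are dropped from the device-info dump (here for CUDA; the HIP interface gets the same treatment below), presumably because newer CUDA releases removed these fields from `cudaDeviceProp`. If the clock rate is still wanted, an attribute query is one possible replacement; a sketch, with error checking omitted and the continued availability of `cudaDevAttrClockRate` on newer toolkits left to be verified:

```cpp
#include <cuda_runtime.h>
#include <iostream>

// Query the core clock via an attribute lookup instead of cudaDeviceProp.
void printClockRate(int deviceId) {
  int clockKHz = 0;
  cudaDeviceGetAttribute(&clockKHz, cudaDevAttrClockRate, deviceId);
  std::cout << "clockRate: " << clockKHz << " kHz\n";
}
```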
diff --git a/interfaces/cuda/Copy.cu b/interfaces/cuda/Copy.cu
index 0b7de7f..4292cb6 100644
--- a/interfaces/cuda/Copy.cu
+++ b/interfaces/cuda/Copy.cu
@@ -71,9 +71,24 @@ void ConcreteAPI::prefetchUnifiedMemTo(Destination type, const void *devPtr, size_t count,
   isFlagSet(status);
   cudaStream_t stream = (streamPtr == nullptr) ? 0 : (static_cast<cudaStream_t>(streamPtr));
+
+  cudaMemLocation location{};
+  if (type == Destination::Host) {
+    location.id = cudaCpuDeviceId;
+    location.type = cudaMemLocationTypeHost;
+  }
+  else if (allowedConcurrentManagedAccess) {
+    location.id = currentDeviceId;
+    location.type = cudaMemLocationTypeDevice;
+  }
+
   cudaMemPrefetchAsync(devPtr, count,
-                       type == Destination::CurrentDevice ? currentDeviceId : cudaCpuDeviceId,
+#if CUDART_VERSION >= 13000
+                       location, 0,
+#else
+                       location.id,
+#endif
                        stream);
   CHECK_ERR;
 }
diff --git a/interfaces/cuda/Memory.cu b/interfaces/cuda/Memory.cu
index 8ab88c5..2402197 100644
--- a/interfaces/cuda/Memory.cu
+++ b/interfaces/cuda/Memory.cu
@@ -96,14 +96,26 @@ void *ConcreteAPI::allocUnifiedMem(size_t size, bool compress, Destination hint)
   void *devPtr;
   cudaMallocManaged(&devPtr, size, cudaMemAttachGlobal);
   CHECK_ERR;
+
+  cudaMemLocation location{};
   if (hint == Destination::Host) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
-    CHECK_ERR;
+    location.id = cudaCpuDeviceId;
+    location.type = cudaMemLocationTypeHost;
   } else if (allowedConcurrentManagedAccess) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, currentDeviceId);
-    CHECK_ERR;
+    location.id = currentDeviceId;
+    location.type = cudaMemLocationTypeDevice;
   }
+
+  cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation,
+#if CUDART_VERSION >= 13000
+                location
+#else
+                location.id
+#endif
+  );
+  CHECK_ERR;
+
   statistics.allocatedMemBytes += size;
   statistics.allocatedUnifiedMemBytes += size;
   memToSizeMap[devPtr] = size;
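The `Copy.cu` and `Memory.cu` hunks above funnel their destination through a `cudaMemLocation`, since CUDA 13 changed `cudaMemPrefetchAsync` and `cudaMemAdvise` to take a location struct plus flags instead of a bare device id. The pattern in isolation, as a sketch; `ptr`, `bytes`, and `deviceId` are placeholders:

```cpp
#include <cuda_runtime.h>

void prefetchToDevice(const void *ptr, size_t bytes, int deviceId, cudaStream_t stream) {
  cudaMemLocation location{};
  location.id = deviceId;
  location.type = cudaMemLocationTypeDevice;
#if CUDART_VERSION >= 13000
  // CUDA 13+: location struct plus a flags argument.
  cudaMemPrefetchAsync(ptr, bytes, location, /*flags=*/0, stream);
#else
  // Older toolkits: plain device id (cudaCpuDeviceId for the host).
  cudaMemPrefetchAsync(ptr, bytes, location.id, stream);
#endif
}
```

Note that on the pre-13 path the device id, not the location type, must be passed; the hunks above are written accordingly.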
diff --git a/interfaces/hip/Control.cpp b/interfaces/hip/Control.cpp
index 0e4774e..b2721ac 100644
--- a/interfaces/hip/Control.cpp
+++ b/interfaces/hip/Control.cpp
@@ -134,11 +134,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
diff --git a/tests/BaseTestSuite.h b/tests/BaseTestSuite.h
index 0f57a44..c42ee6e 100644
--- a/tests/BaseTestSuite.h
+++ b/tests/BaseTestSuite.h
@@ -24,7 +24,7 @@ class BaseTestSuite : public ::testing::Test
   BaseTestSuite() { randomEngine.seed(randomDevice()); }
 
   void SetUp() {
-    device = &DeviceInstance::getInstance();
+    device = &DeviceInstance::instance();
     setUp = true;
   }
diff --git a/tests/array_manip.cpp b/tests/array_manip.cpp
index 201968f..1459bc5 100644
--- a/tests/array_manip.cpp
+++ b/tests/array_manip.cpp
@@ -16,112 +16,112 @@ class ArrayManip : public BaseTestSuite {
 
 TEST_F(ArrayManip, fill) {
   const int N = 100;
-  int *arr = (int *)device->api->allocGlobMem(N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(N * sizeof(int));
 
   int scalar = 502;
-  device->algorithms.fillArray(arr, scalar, N, device->api->getDefaultStream());
+  device->algorithms().fillArray(arr, scalar, N, device->api().getDefaultStream());
 
   std::vector<int> hostVector(N, 0);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(scalar, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchClean32) {
   const int N = 100;
-  float *arr = (float *)device->api->allocGlobMem(N * sizeof(float));
-  device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
+  float *arr = (float *)device->api().allocGlobMem(N * sizeof(float));
+  device->algorithms().touchMemory(arr, N, true, device->api().getDefaultStream());
 
   std::vector<float> hostVector(N, 1);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchNoClean32) {
   const int N = 100;
-  float *arr = (float *)device->api->allocGlobMem(N * sizeof(float));
+  float *arr = (float *)device->api().allocGlobMem(N * sizeof(float));
 
   std::vector<float> hostVector(N, 0);
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(float), device->api->getDefaultStream());
-  device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(float), device->api().getDefaultStream());
+  device->algorithms().touchMemory(arr, N, false, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchClean64) {
   const int N = 100;
-  double *arr = (double *)device->api->allocGlobMem(N * sizeof(double));
-  device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
+  double *arr = (double *)device->api().allocGlobMem(N * sizeof(double));
+  device->algorithms().touchMemory(arr, N, true, device->api().getDefaultStream());
 
   std::vector<double> hostVector(N, 1);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchNoClean64) {
   const int N = 100;
-  double *arr = (double *)device->api->allocGlobMem(N * sizeof(double));
+  double *arr = (double *)device->api().allocGlobMem(N * sizeof(double));
 
   std::vector<double> hostVector(N, 0);
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
-  device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(double), device->api().getDefaultStream());
+  device->algorithms().touchMemory(arr, N, false, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, scale) {
   const int N = 100;
   std::vector<int> hostVector(N, 1);
-  int *arr = (int *)device->api->allocGlobMem(N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(N * sizeof(int));
 
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(int), device->api->getDefaultStream());
-  device->algorithms.scaleArray(arr, 5, N, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(int), device->api().getDefaultStream());
+  device->algorithms().scaleArray(arr, 5, N, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(5, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
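All of these tests follow the same copy-in, run-algorithm, copy-out, sync skeleton; the diff only rewrites `device->api->` and `device->algorithms.` into the accessor calls. For orientation, a sketch of what a new test in the same style would look like; `scaleByTwo` is an illustrative name, not part of this patch:

```cpp
#include <vector>
#include "BaseTestSuite.h"

// Same fixture and call pattern as the tests above, post-refactor.
TEST_F(ArrayManip, scaleByTwo) {
  const int N = 16;
  int *arr = static_cast<int *>(device->api().allocGlobMem(N * sizeof(int)));
  std::vector<int> host(N, 3);

  device->api().copyToAsync(arr, host.data(), N * sizeof(int), device->api().getDefaultStream());
  device->algorithms().scaleArray(arr, 2, N, device->api().getDefaultStream());
  device->api().copyFromAsync(host.data(), arr, N * sizeof(int), device->api().getDefaultStream());
  device->api().syncDefaultStreamWithHost();

  for (int v : host) {
    EXPECT_EQ(6, v);
  }
  device->api().freeGlobMem(arr);
}
```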
diff --git a/tests/batch_manip.cpp b/tests/batch_manip.cpp
index 380dad6..9e1db35 100644
--- a/tests/batch_manip.cpp
+++ b/tests/batch_manip.cpp
@@ -17,8 +17,8 @@ class BatchManip : public BaseTestSuite {
 public:
   template <typename T, typename F>
   void testWrapper(size_t N, size_t M, bool sparse, F&& inner) {
-    T *data = (T *)device->api->allocGlobMem(N * M * sizeof(T));
-    T **batch = (T **)device->api->allocUnifiedMem(N * sizeof(T*));
+    T *data = (T *)device->api().allocGlobMem(N * M * sizeof(T));
+    T **batch = (T **)device->api().allocUnifiedMem(N * sizeof(T*));
 
     for (size_t i = 0; i < N; ++i) {
       if (!(sparse && i % 2 == 0)) {
@@ -31,8 +31,8 @@ class BatchManip : public BaseTestSuite {
 
     std::forward<F>(inner)(batch, data);
 
-    device->api->freeGlobMem(data);
-    device->api->freeGlobMem(batch);
+    device->api().freeGlobMem(data);
+    device->api().freeGlobMem(batch);
   }
 };
@@ -43,12 +43,12 @@ TEST_F(BatchManip, fill32) {
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     float scalar = 502;
-    device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());
+    device->algorithms().setToValue(batch, scalar, M, N, device->api().getDefaultStream());
 
     std::vector<float> hostVector(N * M, 0);
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(scalar, i);
@@ -60,12 +60,12 @@ TEST_F(BatchManip, touchClean32) {
   const int N = 100;
   const int M = 120;
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
 
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(0, i);
@@ -75,11 +75,11 @@ TEST_F(BatchManip, touchClean32) {
 
   testWrapper<float>(N, M, true, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (size_t i = 0; i < N; ++i) {
       for (size_t j = 0; j < M; ++j) {
@@ -100,11 +100,11 @@ TEST_F(BatchManip, touchNoClean32) {
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, false, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
@@ -116,42 +116,42 @@ TEST_F(BatchManip, scatterToUniform32) {
   const int N = 10000;
   const int M = 12000;
 
-  float *data2 = (float *)device->api->allocGlobMem(N * M * sizeof(float));
+  float *data2 = (float *)device->api().allocGlobMem(N * M * sizeof(float));
 
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.copyScatterToUniform(const_cast<const float **>(batch), data2, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data2, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().copyScatterToUniform(const_cast<const float **>(batch), data2, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data2, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
 
 TEST_F(BatchManip, uniformToScatter32) {
   const int N = 10000;
   const int M = 12000;
 
-  float *data2 = (float *)device->api->allocGlobMem(N * M * sizeof(float));
+  float *data2 = (float *)device->api().allocGlobMem(N * M * sizeof(float));
 
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data2, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data2, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().copyUniformToScatter(data2, batch, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
      EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
@@ -164,12 +164,12 @@ TEST_F(BatchManip, fill64) {
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     double scalar = 502;
-    device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());
+    device->algorithms().setToValue(batch, scalar, M, N, device->api().getDefaultStream());
 
     std::vector<double> hostVector(N * M, 0);
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(scalar, i);
@@ -181,12 +181,12 @@ TEST_F(BatchManip, touchClean64) {
   const int N = 100;
   const int M = 120;
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
 
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(0, i);
@@ -196,11 +196,11 @@ TEST_F(BatchManip, touchClean64) {
 
   testWrapper<double>(N, M, true, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (size_t i = 0; i < N; ++i) {
       for (size_t j = 0; j < M; ++j) {
@@ -221,11 +221,11 @@ TEST_F(BatchManip, touchNoClean64) {
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, false, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
@@ -237,42 +237,42 @@ TEST_F(BatchManip, scatterToUniform64) {
   const int N = 100;
   const int M = 120;
 
-  double *data2 = (double *)device->api->allocGlobMem(N * M * sizeof(double));
+  double *data2 = (double *)device->api().allocGlobMem(N * M * sizeof(double));
 
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.copyScatterToUniform(const_cast<const double **>(batch), data2, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data2, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().copyScatterToUniform(const_cast<const double **>(batch), data2, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data2, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
 
 TEST_F(BatchManip, uniformToScatter64) {
   const int N = 100;
   const int M = 120;
 
-  double *data2 = (double *)device->api->allocGlobMem(N * M * sizeof(double));
+  double *data2 = (double *)device->api().allocGlobMem(N * M * sizeof(double));
 
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data2, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data2, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().copyUniformToScatter(data2, batch, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
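For reference, `testWrapper` builds the batched layout these tests depend on: one contiguous allocation of `N * M` values plus a unified-memory array of `N` per-element pointers, where the sparse variants leave every other pointer unset. The same layout sketched on the host:

```cpp
#include <cstddef>
#include <vector>

int main() {
  const std::size_t N = 4; // number of batch elements
  const std::size_t M = 3; // values per element
  std::vector<float> data(N * M);
  std::vector<float *> batch(N, nullptr);
  for (std::size_t i = 0; i < N; ++i) {
    batch[i] = &data[i * M]; // the sparse variants skip every other entry
  }
  return 0;
}
```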
diff --git a/tests/main.cpp b/tests/main.cpp
index 9f514c5..2863432 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -10,9 +10,9 @@ using namespace device;
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  DeviceInstance &device = DeviceInstance::getInstance();
-  device.api->setDevice(0);
-  device.api->initialize();
+  DeviceInstance &device = DeviceInstance::instance();
+  device.api().setDevice(0);
+  device.api().initialize();
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/memory.cpp b/tests/memory.cpp
index f87625f..728ce18 100644
--- a/tests/memory.cpp
+++ b/tests/memory.cpp
@@ -28,16 +28,16 @@ TEST_F(Memories, copy2DMemory) {
     }
   }
 
-  int *arr = (int *)device->api->allocGlobMem(M * N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * N * sizeof(int));
 
   int spitch = N * sizeof(int);
   int dpitch = N * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N];
-  device->api->copy2dArrayFrom(&hostVector2[0], dpitch, arr, spitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], dpitch, arr, spitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N; j++) {
@@ -45,7 +45,7 @@ TEST_F(Memories, copy2DMemory) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 /*
@@ -64,16 +64,16 @@ TEST_F(Memories, copy2DMemoryWithSrcPitch) {
     }
  }
 
-  int *arr = (int *)device->api->allocGlobMem(M * N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * N * sizeof(int));
 
   int spitch = (N + SPI) * sizeof(int);
   int dpitch = N * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N];
-  device->api->copy2dArrayFrom(&hostVector2[0], dpitch, arr, dpitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], dpitch, arr, dpitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N; j++) {
@@ -81,7 +81,7 @@ TEST_F(Memories, copy2DMemoryWithSrcPitch) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 */
@@ -103,16 +103,16 @@ TEST_F(Memories, copy2DMemoryWithDstPitch) {
     }
   }
 
-  int *arr = (int *)device->api->allocGlobMem(M * (N + DPI) * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * (N + DPI) * sizeof(int));
 
   int spitch = (N + SPI) * sizeof(int);
   int dpitch = (N + DPI) * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N + SPI];
-  device->api->copy2dArrayFrom(&hostVector2[0], spitch, arr, dpitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], spitch, arr, dpitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N + SPI; j++) {
@@ -120,7 +120,7 @@ TEST_F(Memories, copy2DMemoryWithDstPitch) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 */
diff --git a/tests/reductions.cpp b/tests/reductions.cpp
index 2af4756..63e40cf 100644
--- a/tests/reductions.cpp
+++ b/tests/reductions.cpp
@@ -28,45 +28,45 @@ TEST_F(Reductions, Add) {
     element = distribution(randomEngine);
   }
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), 0, std::plus<unsigned>());
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Add, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Add, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
 
 TEST_F(Reductions, Max) {
   constexpr size_t size = 20010000;
   std::vector<unsigned> vector(size, 0);
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   std::uniform_int_distribution<> distribution(10, 100);
   for (auto &element : vector) {
     element = distribution(randomEngine);
   }
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto max = [](unsigned a, unsigned b) -> unsigned { return a > b ? a : b; };
   auto initValue = std::numeric_limits<unsigned>::min();
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), initValue, max);
 
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Max, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Max, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
 
 TEST_F(Reductions, Min) {
@@ -78,19 +78,19 @@ TEST_F(Reductions, Min) {
     element = distribution(randomEngine);
   }
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto min = [](unsigned a, unsigned b) -> unsigned { return a > b ? b : a; };
   auto initValue = std::numeric_limits<unsigned>::max();
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), initValue, min);
 
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Min, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Min, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
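The expected values in these reduction tests come from `std::accumulate` on the host. A compact host-side reference for the `reduceVector` contract they exercise, covering only the `overrideResult == true` case the tests use; the enum mirrors `ReductionType` from this repository:

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

enum class ReductionType { Add, Max, Min };

// Host reference: what *result should hold after reduceVector(..., true, ...).
template <typename AccT, typename VecT>
AccT reduceVectorReference(const std::vector<VecT> &buffer, ReductionType type) {
  switch (type) {
  case ReductionType::Add:
    return std::accumulate(buffer.begin(), buffer.end(), AccT{0});
  case ReductionType::Max:
    return static_cast<AccT>(*std::max_element(buffer.begin(), buffer.end()));
  case ReductionType::Min:
    return static_cast<AccT>(*std::min_element(buffer.begin(), buffer.end()));
  }
  return AccT{0};
}
```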