diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9fd1045..f1ee0e2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,6 +23,14 @@ jobs:
           - Release
           - Debug
         setup:
+          - arch: none
+            backend: none
+            cc: gcc-13
+            cxx: g++-13
+            fc: gfortran-13
+            container: seissol/gha-cpu:davschneller-gpu-image
+            runner: ubuntu-24.04
+            pythonbreak: true
           - arch: sm_60
             backend: cuda
             cc: gcc-13
@@ -98,7 +106,7 @@ jobs:
           cd ../..
 
       - name: checkout-device
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           submodules: recursive
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97cab89..b2e7e1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,19 +13,18 @@ if (NOT DEFINED DEVICE_BACKEND)
   message(FATAL_ERROR "DEVICE_BACKEND variable has not been provided into the submodule")
 else()
   set(FOUND OFF)
-  foreach(VARIANT cuda hip oneapi hipsycl acpp)
+  foreach(VARIANT cuda hip oneapi hipsycl acpp none)
     if (${DEVICE_BACKEND} STREQUAL ${VARIANT})
       set(FOUND ON)
     endif()
   endforeach()
   if (NOT FOUND)
-    message(FATAL_ERROR "DEVICE_BACKEND must be either cuda, hip, opeapi, acpp, or hipsycl. Given: ${DEVICE_BACKEND}")
+    message(FATAL_ERROR "DEVICE_BACKEND must be either none, cuda, hip, oneapi, acpp, or hipsycl. Given: ${DEVICE_BACKEND}")
   endif()
 endif()
 
-if (NOT DEFINED DEVICE_ARCH)
-  message(FATAL_ERROR "DEVICE_ARCH is not defined. "
-          "Supported for example: sm_60, sm_61, sm_70, sm_71, gfx906, gfx908, dg1, bdw, skl, Gen8, Gen9, Gen11, Gen12LP")
+if ((NOT (${DEVICE_BACKEND} STREQUAL "none")) AND (NOT DEFINED DEVICE_ARCH))
+  message(FATAL_ERROR "DEVICE_ARCH has not been defined")
 endif()
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
@@ -44,10 +43,10 @@ elseif(${DEVICE_BACKEND} STREQUAL "hip")
 elseif((${DEVICE_BACKEND} STREQUAL "oneapi") OR (${DEVICE_BACKEND} STREQUAL "hipsycl") OR (${DEVICE_BACKEND} STREQUAL "acpp"))
   set(BACKEND_FOLDER "sycl")
   include(sycl.cmake)
+else()
+  add_library(device OBJECT device.cpp algorithms/Dummy.cpp)
 endif()
 
-string(TOUPPER ${BACKEND_FOLDER} BACKEND_UPPER_CASE)
-
 # common options
 target_compile_features(device PRIVATE cxx_std_17)
@@ -58,15 +57,24 @@ if (ENABLE_PROFILING_MARKERS)
   target_compile_definitions(device PRIVATE PROFILING_ENABLED)
 endif()
 
-target_compile_definitions(device PRIVATE DEVICE_LANG_${BACKEND_UPPER_CASE})
+if (NOT (${DEVICE_BACKEND} STREQUAL "none"))
+  string(TOUPPER ${BACKEND_FOLDER} BACKEND_UPPER_CASE)
+  target_compile_definitions(device PRIVATE DEVICE_LANG_${BACKEND_UPPER_CASE})
+
+  target_include_directories(device PRIVATE
+    interfaces/${BACKEND_FOLDER}
+    interfaces/common
+    algorithms/${BACKEND_FOLDER})
+endif()
 
 if (LOG_LEVEL_MASTER)
   target_compile_definitions(device PRIVATE LOG_LEVEL=${LOG_LEVEL_MASTER})
 endif()
 
-target_include_directories(device PRIVATE .
-                           interfaces/${BACKEND_FOLDER}
-                           interfaces/common
-                           algorithms/${BACKEND_FOLDER}
-                           submodules)
+if (DEFINED DEVICE_SUBMODULES)
+  target_include_directories(device PRIVATE ${DEVICE_SUBMODULES})
+else()
+  target_include_directories(device PRIVATE submodules)
+endif()
+target_include_directories(device PRIVATE .)
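With the new `none` backend, the wrapper builds as a stub: `libdevice` still links, but no `DEVICE_LANG_*` macro is defined and no `ConcreteAPI` is compiled in (see the `device.cpp` changes below). A minimal sketch of how a consumer might probe for this at runtime; the `hasDevice` helper is hypothetical and not part of this patch:

```cpp
#include <iostream>
#include <stdexcept>

#include "device.h"

// Hypothetical probe: with DEVICE_BACKEND=none, DeviceInstance::api()
// throws std::runtime_error because no ConcreteAPI was ever constructed.
bool hasDevice() {
  try {
    device::DeviceInstance::instance().api();
    return true;
  } catch (const std::runtime_error &) {
    return false;
  }
}

int main() {
  std::cout << (hasDevice() ? "device backend available" : "stub (none) build") << '\n';
  return 0;
}
```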
diff --git a/UsmAllocator.h b/UsmAllocator.h
index 89d7f6e..5e9b305 100644
--- a/UsmAllocator.h
+++ b/UsmAllocator.h
@@ -22,7 +22,7 @@ class UsmAllocator {
   using difference_type = std::ptrdiff_t;
 
   UsmAllocator() noexcept = delete;
-  UsmAllocator(device::DeviceInstance& instance) noexcept : api(instance.api) {}
+  UsmAllocator(device::DeviceInstance& instance) noexcept : api(&instance.api()) {}
   UsmAllocator(const UsmAllocator &) noexcept = default;
   UsmAllocator(UsmAllocator &&) noexcept = default;
diff --git a/algorithms/Dummy.cpp b/algorithms/Dummy.cpp
new file mode 100644
index 0000000..aece8b2
--- /dev/null
+++ b/algorithms/Dummy.cpp
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: 2020-2024 SeisSol Group
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <cstddef>
+#include <cstring>
+#include <string>
+#include "Algorithms.h"
+
+namespace device {
+template <typename T>
+void Algorithms::scaleArray(T *devArray, T scalar, const size_t numElements, void* streamPtr) {
+}
+template void Algorithms::scaleArray(float *devArray, float scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(double *devArray, double scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(int *devArray, int scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(unsigned *devArray, unsigned scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::scaleArray(char *devArray, char scalar, const size_t numElements, void* streamPtr);
+
+template <typename T>
+void Algorithms::fillArray(T *devArray, const T scalar, const size_t numElements, void* streamPtr) {
+}
+template void Algorithms::fillArray(float *devArray, float scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(double *devArray, double scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(int *devArray, int scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(unsigned *devArray, unsigned scalar, const size_t numElements, void* streamPtr);
+template void Algorithms::fillArray(char *devArray, char scalar, const size_t numElements, void* streamPtr);
+
+void Algorithms::touchMemoryI(void *ptr, size_t size, bool clean, void* streamPtr) {
+}
+
+void Algorithms::incrementalAddI(void** out,
+                                 void *base,
+                                 size_t increment,
+                                 size_t numElements,
+                                 void* streamPtr) {
+}
+
+void Algorithms::streamBatchedDataI(const void **baseSrcPtr,
+                                    void **baseDstPtr,
+                                    size_t elementSize,
+                                    size_t numElements,
+                                    void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::accumulateBatchedData(const T **baseSrcPtr,
+                                       T **baseDstPtr,
+                                       size_t elementSize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+template void Algorithms::accumulateBatchedData(const float **baseSrcPtr,
+                                                float **baseDstPtr,
+                                                size_t elementSize,
+                                                size_t numElements,
+                                                void* streamPtr);
+
+template void Algorithms::accumulateBatchedData(const double **baseSrcPtr,
+                                                double **baseDstPtr,
+                                                size_t elementSize,
+                                                size_t numElements,
+                                                void* streamPtr);
+
+void Algorithms::touchBatchedMemoryI(void **basePtr,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     bool clean,
+                                     void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::setToValue(T** out, T value, size_t elementSize, size_t numElements, void* streamPtr) {
+}
+
+template void Algorithms::setToValue(float** out, float value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(double** out, double value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
+
+void Algorithms::copyUniformToScatterI(const void *src,
+                                       void **dst,
+                                       size_t srcOffset,
+                                       size_t copySize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+void Algorithms::copyScatterToUniformI(const void **src,
+                                       void *dst,
+                                       size_t dstOffset,
+                                       size_t copySize,
+                                       size_t numElements,
+                                       void* streamPtr) {
+}
+
+template <typename T>
+void Algorithms::compareDataWithHost(const T *hostPtr, const T *devPtr, const size_t numElements,
+                                     const std::string &dataName) {
+}
+
+template void Algorithms::compareDataWithHost(const float *hostPtr, const float *devPtr, const size_t numElements,
+                                              const std::string &dataName);
+template void Algorithms::compareDataWithHost(const double *hostPtr, const double *devPtr, const size_t numElements,
+                                              const std::string &dataName);
+
+template <typename AccT, typename VecT>
+void Algorithms::reduceVector(AccT* result, const VecT *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr) {
+}
+
+template void Algorithms::reduceVector(int* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long* result, const long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long* result, const unsigned long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const int *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(long long* result, const long long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(unsigned long long* result, const unsigned long long *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(float* result, const float *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(double* result, const float *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+template void Algorithms::reduceVector(double* result, const double *buffer, bool overrideResult, size_t size, ReductionType type, void* streamPtr);
+
+} // namespace device
diff --git a/device.cpp b/device.cpp
index 0438a24..f9d89d3 100644
--- a/device.cpp
+++ b/device.cpp
@@ -3,15 +3,17 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "device.h"
+#include "Algorithms.h"
 
 #ifdef DEVICE_LANG_CUDA
 #include "interfaces/cuda/CudaWrappedAPI.h"
+#define DEVICE_ENABLED
 #elif DEVICE_LANG_HIP
 #include "interfaces/hip/HipWrappedAPI.h"
+#define DEVICE_ENABLED
 #elif DEVICE_LANG_SYCL
 #include "interfaces/sycl/SyclWrappedAPI.h"
-#else
-#error "Unknown interface for the device wrapper"
+#define DEVICE_ENABLED
 #endif
 
 using namespace device;
@@ -20,17 +22,41 @@ DeviceInstance::DeviceInstance() {
   // NOTE: all headers inside of macros define their unique ConcreteInterface.
   // Make sure to not include multiple different interfaces at the same time.
   // Only one interface is allowed per program because of issues of unique compilers, etc.
-  api = new ConcreteAPI;
-  algorithms.setDeviceApi(api);
+#ifdef DEVICE_ENABLED
+  apiP = std::make_unique<ConcreteAPI>();
+  algorithmsP = std::make_unique<Algorithms>();
+
+  algorithmsP->setDeviceApi(apiP.get());
+#endif
 }
 
 DeviceInstance::~DeviceInstance() {
+#ifdef DEVICE_ENABLED
   this->finalize();
-  delete api;
-  api = nullptr;
+#endif
 }
 
 void DeviceInstance::finalize() {
-  api->finalize();
+#ifdef DEVICE_ENABLED
+  api().finalize();
+#endif
 }
+
+DeviceInstance& DeviceInstance::instance() {
+  static DeviceInstance currentInstance;
+  return currentInstance;
+}
+
+AbstractAPI& DeviceInstance::api() {
+  if (apiP == nullptr) {
+    throw std::runtime_error("Device API was requested, but it is not initialized.");
+  }
+  return *apiP;
+}
+
+Algorithms& DeviceInstance::algorithms() {
+  if (algorithmsP == nullptr) {
+    throw std::runtime_error("Device algorithms were requested, but they are not initialized.");
+  }
+  return *algorithmsP;
+}
diff --git a/device.h b/device.h
index 67f6d53..a1aeee5 100644
--- a/device.h
+++ b/device.h
@@ -7,6 +7,8 @@
 
 #include "AbstractAPI.h"
 #include "Algorithms.h"
+#include <memory>
+#include <stdexcept>
 
 namespace device {
 
@@ -15,17 +17,17 @@ class DeviceInstance {
 public:
   DeviceInstance(const DeviceInstance &) = delete;
   DeviceInstance &operator=(const DeviceInstance &) = delete;
-  static DeviceInstance &getInstance() {
-    static DeviceInstance instance;
-    return instance;
-  }
+  static DeviceInstance& instance();
   ~DeviceInstance();
   void finalize();
 
-  AbstractAPI *api{nullptr};
-  Algorithms algorithms{};
+  AbstractAPI& api();
+
+  Algorithms& algorithms();
 
 private:
+  std::unique_ptr<AbstractAPI> apiP{nullptr};
+  std::unique_ptr<Algorithms> algorithmsP{nullptr};
   DeviceInstance();
 };
 } // namespace device
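Taken together, the `device.h`/`device.cpp` changes replace the public `AbstractAPI *api` member with accessor functions that return references and fail loudly on a stub build. Call sites migrate mechanically; a sketch of the before/after pattern, using only names from this patch:

```cpp
#include "device.h"

void example() {
  // Before: raw pointer member on the singleton.
  //   DeviceInstance &device = DeviceInstance::getInstance();
  //   device.api->initialize();

  // After: renamed accessor returning a reference; throws std::runtime_error
  // when the wrapper was built with DEVICE_BACKEND=none.
  device::DeviceInstance &device = device::DeviceInstance::instance();
  device.api().initialize();

  // Call sites that cached the pointer can keep arrow syntax:
  auto *api = &device.api();
  api->syncDevice();
}
```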
devices available: " << numDevices << '\n'; if (numDevices > 0) { - device.api->setDevice(0); + device.api().setDevice(0); } - device.api->initialize(); + device.api().initialize(); // print some device info - std::string deviceInfo(device.api->getDeviceInfoAsText(0)); + std::string deviceInfo(device.api().getDeviceInfoAsText(0)); std::cout << deviceInfo << std::endl; - std::cout << "alignment: " << device.api->getGlobMemAlignment() << std::endl; - std::cout << "max available mem: " << device.api->getMaxAvailableMem() << std::endl; - std::cout << "max shared mem: " << device.api->getMaxSharedMemSize() << std::endl; - std::cout << "max thread block size: " << device.api->getMaxThreadBlockSize() << std::endl; + std::cout << "alignment: " << device.api().getGlobMemAlignment() << std::endl; + std::cout << "max available mem: " << device.api().getMaxAvailableMem() << std::endl; + std::cout << "max shared mem: " << device.api().getMaxSharedMemSize() << std::endl; + std::cout << "max thread block size: " << device.api().getMaxThreadBlockSize() << std::endl; // allocate mem. on a device - real *dInputArray = static_cast(device.api->allocGlobMem(sizeof(real) * size)); - real *dOutputArray = static_cast(device.api->allocGlobMem(sizeof(real) * size)); + real *dInputArray = static_cast(device.api().allocGlobMem(sizeof(real) * size)); + real *dOutputArray = static_cast(device.api().allocGlobMem(sizeof(real) * size)); // copy data into a device - device.api->copyTo(dInputArray, inputArray, sizeof(real) * size); + device.api().copyTo(dInputArray, inputArray, sizeof(real) * size); // call a kernel - device.api->checkOffloading(); - device.api->syncDevice(); + device.api().checkOffloading(); + device.api().syncDevice(); // copy data from a device - device.api->copyFrom(outputArray, dOutputArray, sizeof(real) * size); + device.api().copyFrom(outputArray, dOutputArray, sizeof(real) * size); // deallocate mem. 
on a device - device.api->freeGlobMem(dInputArray); - device.api->freeGlobMem(dOutputArray); + device.api().freeGlobMem(dInputArray); + device.api().freeGlobMem(dOutputArray); - std::cout << device.api->getMemLeaksReport(); + std::cout << device.api().getMemLeaksReport(); device.finalize(); diff --git a/examples/jacobi/src/gpu/solver.cpp b/examples/jacobi/src/gpu/solver.cpp index b348cd5..4ebf56c 100644 --- a/examples/jacobi/src/gpu/solver.cpp +++ b/examples/jacobi/src/gpu/solver.cpp @@ -39,22 +39,22 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat std::tie(invDiag, lu) = getDLU(matrix); // allocate gpu data structures - DeviceInstance &device = DeviceInstance::getInstance(); - device.api->setDevice(ws.rank); - device.api->initialize(); + DeviceInstance &device = DeviceInstance::instance(); + device.api().setDevice(ws.rank); + device.api().initialize(); this->setUp(lu, rhs, x, residual, invDiag); // start solver Statistics computeStat(ws, range); Statistics commStat(ws, range); - auto defaultStream = device.api->getDefaultStream(); + auto defaultStream = device.api().getDefaultStream(); while ((infNorm > settings.eps) and (currentIter <= settings.maxNumIters)) { computeStat.start(); launch_multMatVec(*devLU, devX, devTemp, defaultStream); launch_manipVectors(range, devRhs, devTemp, devX, VectorManipOps::Subtraction, defaultStream); launch_manipVectors(range, devInvDiag, devX, devX, VectorManipOps::Multiply, defaultStream); - device.api->syncDevice(); + device.api().syncDevice(); computeStat.stop(); commStat.start(); @@ -69,7 +69,7 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat launch_multMatVec(*devLU, devX, devResidual, defaultStream); launch_manipVectors(range, devTemp, devResidual, devResidual, VectorManipOps::Addition, defaultStream); launch_manipVectors(range, devRhs, devResidual, devResidual, VectorManipOps::Subtraction, defaultStream); - device.api->copyFrom(const_cast(residual.data()), devResidual, residual.size() * sizeof(real)); + device.api().copyFrom(const_cast(residual.data()), devResidual, residual.size() * sizeof(real)); auto localInfNorm = infNorm = host::getInfNorm(range, residual); #ifdef USE_MPI @@ -90,31 +90,31 @@ void gpu::Solver::run(const SolverSettingsT &settings, const CpuMatrixDataT &mat ++currentIter; } - device.api->syncDevice(); + device.api().syncDevice(); assembler.assemble(devX, devTempX); - device.api->copyFrom(const_cast(x.data()), devTempX, x.size() * sizeof(real)); + device.api().copyFrom(const_cast(x.data()), devTempX, x.size() * sizeof(real)); this->tearDown(); } void gpu::Solver::setUp(const CpuMatrixDataT &lu, const VectorT &rhs, const VectorT &x, const VectorT &residual, VectorT &invDiag) { - DeviceInstance &device = DeviceInstance::getInstance(); + DeviceInstance &device = DeviceInstance::instance(); - devRhs = static_cast(device.api->allocGlobMem(rhs.size() * sizeof(real))); - device.api->copyTo(devRhs, rhs.data(), rhs.size() * sizeof(real)); + devRhs = static_cast(device.api().allocGlobMem(rhs.size() * sizeof(real))); + device.api().copyTo(devRhs, rhs.data(), rhs.size() * sizeof(real)); - devX = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); - device.api->copyTo(devX, x.data(), x.size() * sizeof(real)); + devX = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); + device.api().copyTo(devX, x.data(), x.size() * sizeof(real)); - devTempX = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); - device.api->copyTo(devTempX, x.data(), 
x.size() * sizeof(real)); + devTempX = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); + device.api().copyTo(devTempX, x.data(), x.size() * sizeof(real)); - devTemp = static_cast(device.api->allocGlobMem(x.size() * sizeof(real))); + devTemp = static_cast(device.api().allocGlobMem(x.size() * sizeof(real))); // InvDiag still holds the diagonal elements at this point - devDiag = static_cast(device.api->allocGlobMem(invDiag.size() * sizeof(real))); - device.api->copyTo(devDiag, invDiag.data(), invDiag.size() * sizeof(real)); + devDiag = static_cast(device.api().allocGlobMem(invDiag.size() * sizeof(real))); + device.api().copyTo(devDiag, invDiag.data(), invDiag.size() * sizeof(real)); // compute inverse diagonal matrix std::transform(invDiag.begin(), invDiag.end(), invDiag.begin(), [](const real &diag) { @@ -122,33 +122,33 @@ void gpu::Solver::setUp(const CpuMatrixDataT &lu, const VectorT &rhs, const Vect return 1.0 / diag; }); - devInvDiag = static_cast(device.api->allocGlobMem(invDiag.size() * sizeof(real))); - device.api->copyTo(devInvDiag, invDiag.data(), invDiag.size() * sizeof(real)); + devInvDiag = static_cast(device.api().allocGlobMem(invDiag.size() * sizeof(real))); + device.api().copyTo(devInvDiag, invDiag.data(), invDiag.size() * sizeof(real)); - devResidual = static_cast(device.api->allocGlobMem(residual.size() * sizeof(real))); - device.api->copyTo(devResidual, residual.data(), residual.size() * sizeof(real)); + devResidual = static_cast(device.api().allocGlobMem(residual.size() * sizeof(real))); + device.api().copyTo(devResidual, residual.data(), residual.size() * sizeof(real)); devLU = std::make_unique(lu.info); // GpuMatrixDataT devLU(LU.Info); - devLU->data = static_cast(device.api->allocGlobMem(lu.info.volume * sizeof(real))); - device.api->copyTo(devLU->data, lu.data.data(), lu.info.volume * sizeof(real)); + devLU->data = static_cast(device.api().allocGlobMem(lu.info.volume * sizeof(real))); + device.api().copyTo(devLU->data, lu.data.data(), lu.info.volume * sizeof(real)); - devLU->indices = static_cast(device.api->allocGlobMem(lu.info.volume * sizeof(int))); - device.api->copyTo(devLU->indices, lu.indices.data(), lu.info.volume * sizeof(int)); + devLU->indices = static_cast(device.api().allocGlobMem(lu.info.volume * sizeof(int))); + device.api().copyTo(devLU->indices, lu.indices.data(), lu.info.volume * sizeof(int)); } void gpu::Solver::tearDown() { - DeviceInstance &device = DeviceInstance::getInstance(); - device.api->freeGlobMem(devResidual); - device.api->freeGlobMem(devInvDiag); - device.api->freeGlobMem(devDiag); - device.api->freeGlobMem(devTemp); - device.api->freeGlobMem(devX); - device.api->freeGlobMem(devTempX); - device.api->freeGlobMem(devRhs); - - device.api->freeGlobMem(devLU->data); - device.api->freeGlobMem(devLU->indices); + DeviceInstance &device = DeviceInstance::instance(); + device.api().freeGlobMem(devResidual); + device.api().freeGlobMem(devInvDiag); + device.api().freeGlobMem(devDiag); + device.api().freeGlobMem(devTemp); + device.api().freeGlobMem(devX); + device.api().freeGlobMem(devTempX); + device.api().freeGlobMem(devRhs); + + device.api().freeGlobMem(devLU->data); + device.api().freeGlobMem(devLU->indices); devLU.reset(nullptr); } diff --git a/examples/jacobi/src/helper.hpp b/examples/jacobi/src/helper.hpp index 977eb3d..9c1bb75 100644 --- a/examples/jacobi/src/helper.hpp +++ b/examples/jacobi/src/helper.hpp @@ -96,8 +96,8 @@ class VectorAssembler { std::memcpy(reinterpret_cast(dest), reinterpret_cast(src), recvCounts[0] * 
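The Jacobi example's `setUp`/`tearDown` pair shows how repetitive the manual `allocGlobMem`/`copyTo`/`freeGlobMem` sequences remain under the new accessor. A hedged sketch of an RAII wrapper that would collapse those pairs; `DeviceBuffer` is hypothetical and not part of this patch, and it assumes the `allocGlobMem`/`copyTo`/`freeGlobMem` signatures used above:

```cpp
#include <cstddef>
#include "device.h"

// Hypothetical RAII helper: owns one device allocation, so setUp/tearDown
// pairs like the ones above cannot leak on an early return.
template <typename T>
class DeviceBuffer {
public:
  explicit DeviceBuffer(std::size_t count)
      : size(count * sizeof(T)),
        ptr(static_cast<T *>(device::DeviceInstance::instance().api().allocGlobMem(size))) {}
  ~DeviceBuffer() { device::DeviceInstance::instance().api().freeGlobMem(ptr); }
  DeviceBuffer(const DeviceBuffer &) = delete;
  DeviceBuffer &operator=(const DeviceBuffer &) = delete;

  // Host-to-device copy of the full buffer.
  void copyFromHost(const T *host) {
    device::DeviceInstance::instance().api().copyTo(ptr, host, size);
  }
  T *get() { return ptr; }

private:
  std::size_t size;
  T *ptr;
};
```

With such a wrapper, `tearDown` would become unnecessary: the buffers release themselves when the solver is destroyed.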
diff --git a/examples/jacobi/src/helper.hpp b/examples/jacobi/src/helper.hpp
index 977eb3d..9c1bb75 100644
--- a/examples/jacobi/src/helper.hpp
+++ b/examples/jacobi/src/helper.hpp
@@ -96,8 +96,8 @@ class VectorAssembler {
       std::memcpy(reinterpret_cast<void *>(dest), reinterpret_cast<const void *>(src), recvCounts[0] * sizeof(real));
     }
   } else {
-    device::DeviceInstance &device = device::DeviceInstance::getInstance();
-    device.api->copyBetween(dest, src, recvCounts[0] * sizeof(real));
+    device::DeviceInstance &device = device::DeviceInstance::instance();
+    device.api().copyBetween(dest, src, recvCounts[0] * sizeof(real));
   }
 #endif
 }
diff --git a/examples/jacobi/tests/driver.cpp b/examples/jacobi/tests/driver.cpp
index c00fbfc..7445592 100644
--- a/examples/jacobi/tests/driver.cpp
+++ b/examples/jacobi/tests/driver.cpp
@@ -24,9 +24,9 @@ int main(int argc, char **argv) {
   WorkSpaceT ws{MPI_COMM_WORLD};
 #endif
 
-  DeviceInstance &device = DeviceInstance::getInstance();
-  device.api->setDevice(ws.rank);
-  device.api->initialize();
+  DeviceInstance &device = DeviceInstance::instance();
+  device.api().setDevice(ws.rank);
+  device.api().initialize();
 
   ::testing::InitGoogleTest(&argc, argv);
   int result = RUN_ALL_TESTS();
diff --git a/examples/jacobi/tests/gpu/subroutines_tests.cpp b/examples/jacobi/tests/gpu/subroutines_tests.cpp
index 61fe904..e920207 100644
--- a/examples/jacobi/tests/gpu/subroutines_tests.cpp
+++ b/examples/jacobi/tests/gpu/subroutines_tests.cpp
@@ -12,7 +12,7 @@ using ::testing::ElementsAreArray;
 
 TEST(Subroutines, MultMatrixVec) {
-  auto *api = ::device::DeviceInstance::getInstance().api;
+  auto *api = &::device::DeviceInstance::instance().api();
   auto defaultStream = api->getDefaultStream();
 
   const int size = 3;
@@ -54,7 +54,7 @@ TEST(Subroutines, MultMatrixVec) {
 }
 
 TEST(Subroutines, VectorManips) {
-  auto *api = ::device::DeviceInstance::getInstance().api;
+  auto *api = &::device::DeviceInstance::instance().api();
   auto defaultStream = api->getDefaultStream();
 
   const int size = 3;
diff --git a/examples/mpi/src/main.cpp b/examples/mpi/src/main.cpp
index b65c100..1e70aa2 100644
--- a/examples/mpi/src/main.cpp
+++ b/examples/mpi/src/main.cpp
@@ -10,7 +10,7 @@ using namespace device;
 
 void forkOther(int otherRank) {
-  DeviceInstance &device = DeviceInstance::getInstance();
-  auto *api = device.api;
+  DeviceInstance &device = DeviceInstance::instance();
+  auto *api = &device.api();
 
   api->setDevice(otherRank);
@@ -24,7 +24,7 @@ void forkOther(int otherRank) {
 }
 
 void forkRoot(int rootRank) {
-  DeviceInstance &device = DeviceInstance::getInstance();
-  auto *api = device.api;
+  DeviceInstance &device = DeviceInstance::instance();
+  auto *api = &device.api();
 
   api->setDevice(rootRank);
diff --git a/interfaces/cuda/Control.cu b/interfaces/cuda/Control.cu
index b55b6f0..f154787 100644
--- a/interfaces/cuda/Control.cu
+++ b/interfaces/cuda/Control.cu
@@ -125,11 +125,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
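The `clockRate` and `computeMode` lines are dropped from the device-info dump (here for CUDA; the HIP interface gets the same treatment below), presumably because newer CUDA releases removed these fields from `cudaDeviceProp`. If the clock rate is still wanted, an attribute query is one possible replacement; a sketch, with error checking omitted and the continued availability of `cudaDevAttrClockRate` on newer toolkits left to be verified:

```cpp
#include <cuda_runtime.h>
#include <iostream>

// Query the core clock via an attribute lookup instead of cudaDeviceProp.
void printClockRate(int deviceId) {
  int clockKHz = 0;
  cudaDeviceGetAttribute(&clockKHz, cudaDevAttrClockRate, deviceId);
  std::cout << "clockRate: " << clockKHz << " kHz\n";
}
```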
diff --git a/interfaces/cuda/Copy.cu b/interfaces/cuda/Copy.cu
index 0b7de7f..4292cb6 100644
--- a/interfaces/cuda/Copy.cu
+++ b/interfaces/cuda/Copy.cu
@@ -71,9 +71,24 @@ void ConcreteAPI::prefetchUnifiedMemTo(Destination type, const void *devPtr, size_t count,
   isFlagSet(status);
   cudaStream_t stream = (streamPtr == nullptr) ? 0 : (static_cast<cudaStream_t>(streamPtr));
+
+  cudaMemLocation location{};
+  if (type == Destination::Host) {
+    location.id = cudaCpuDeviceId;
+    location.type = cudaMemLocationTypeHost;
+  }
+  else if (allowedConcurrentManagedAccess) {
+    location.id = currentDeviceId;
+    location.type = cudaMemLocationTypeDevice;
+  }
+
   cudaMemPrefetchAsync(devPtr, count,
-                       type == Destination::CurrentDevice ? currentDeviceId : cudaCpuDeviceId,
+#if CUDART_VERSION >= 13000
+                       location, 0,
+#else
+                       location.id,
+#endif
                        stream);
   CHECK_ERR;
 }
diff --git a/interfaces/cuda/Memory.cu b/interfaces/cuda/Memory.cu
index 8ab88c5..2402197 100644
--- a/interfaces/cuda/Memory.cu
+++ b/interfaces/cuda/Memory.cu
@@ -96,14 +96,26 @@ void *ConcreteAPI::allocUnifiedMem(size_t size, bool compress, Destination hint)
   void *devPtr;
   cudaMallocManaged(&devPtr, size, cudaMemAttachGlobal);
   CHECK_ERR;
+
+  cudaMemLocation location{};
   if (hint == Destination::Host) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
-    CHECK_ERR;
+    location.id = cudaCpuDeviceId;
+    location.type = cudaMemLocationTypeHost;
   } else if (allowedConcurrentManagedAccess) {
-    cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation, currentDeviceId);
-    CHECK_ERR;
+    location.id = currentDeviceId;
+    location.type = cudaMemLocationTypeDevice;
   }
+
+  cudaMemAdvise(devPtr, size, cudaMemAdviseSetPreferredLocation,
+#if CUDART_VERSION >= 13000
+                location
+#else
+                location.id
+#endif
+  );
+  CHECK_ERR;
+
   statistics.allocatedMemBytes += size;
   statistics.allocatedUnifiedMemBytes += size;
   memToSizeMap[devPtr] = size;
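The `Copy.cu` and `Memory.cu` hunks above funnel their destination through a `cudaMemLocation`, since CUDA 13 changed `cudaMemPrefetchAsync` and `cudaMemAdvise` to take a location struct plus flags instead of a bare device id. The pattern in isolation, as a sketch; `ptr`, `bytes`, and `deviceId` are placeholders:

```cpp
#include <cuda_runtime.h>

void prefetchToDevice(const void *ptr, size_t bytes, int deviceId, cudaStream_t stream) {
  cudaMemLocation location{};
  location.id = deviceId;
  location.type = cudaMemLocationTypeDevice;
#if CUDART_VERSION >= 13000
  // CUDA 13+: location struct plus a flags argument.
  cudaMemPrefetchAsync(ptr, bytes, location, /*flags=*/0, stream);
#else
  // Older toolkits: plain device id (cudaCpuDeviceId for the host).
  cudaMemPrefetchAsync(ptr, bytes, location.id, stream);
#endif
}
```

Note that on the pre-13 path the device id, not the location type, must be passed; the hunks above are written accordingly.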
diff --git a/interfaces/hip/Control.cpp b/interfaces/hip/Control.cpp
index 0e4774e..b2721ac 100644
--- a/interfaces/hip/Control.cpp
+++ b/interfaces/hip/Control.cpp
@@ -134,11 +134,9 @@ std::string ConcreteAPI::getDeviceInfoAsText(int deviceId) {
   info << "memPitch: " << property.memPitch << '\n';
   info << "maxThreadsPerBlock: " << property.maxThreadsPerBlock << '\n';
   info << "totalConstMem: " << property.totalConstMem << '\n';
-  info << "clockRate: " << property.clockRate << '\n';
   info << "multiProcessorCount: " << property.multiProcessorCount << '\n';
   info << "integrated: " << property.integrated << '\n';
   info << "canMapHostMemory: " << property.canMapHostMemory << '\n';
-  info << "computeMode: " << property.computeMode << '\n';
   info << "concurrentKernels: " << property.concurrentKernels << '\n';
   info << "pciBusID: " << property.pciBusID << '\n';
   info << "pciDeviceID: " << property.pciDeviceID << '\n';
diff --git a/tests/BaseTestSuite.h b/tests/BaseTestSuite.h
index 0f57a44..c42ee6e 100644
--- a/tests/BaseTestSuite.h
+++ b/tests/BaseTestSuite.h
@@ -24,7 +24,7 @@ class BaseTestSuite : public ::testing::Test
   BaseTestSuite() { randomEngine.seed(randomDevice()); }
 
   void SetUp() {
-    device = &DeviceInstance::getInstance();
+    device = &DeviceInstance::instance();
     setUp = true;
   }
diff --git a/tests/array_manip.cpp b/tests/array_manip.cpp
index 201968f..1459bc5 100644
--- a/tests/array_manip.cpp
+++ b/tests/array_manip.cpp
@@ -16,112 +16,112 @@ class ArrayManip : public BaseTestSuite {
 
 TEST_F(ArrayManip, fill) {
   const int N = 100;
-  int *arr = (int *)device->api->allocGlobMem(N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(N * sizeof(int));
 
   int scalar = 502;
-  device->algorithms.fillArray(arr, scalar, N, device->api->getDefaultStream());
+  device->algorithms().fillArray(arr, scalar, N, device->api().getDefaultStream());
 
   std::vector<int> hostVector(N, 0);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(scalar, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchClean32) {
   const int N = 100;
-  float *arr = (float *)device->api->allocGlobMem(N * sizeof(float));
-  device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
+  float *arr = (float *)device->api().allocGlobMem(N * sizeof(float));
+  device->algorithms().touchMemory(arr, N, true, device->api().getDefaultStream());
 
   std::vector<float> hostVector(N, 1);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchNoClean32) {
   const int N = 100;
-  float *arr = (float *)device->api->allocGlobMem(N * sizeof(float));
+  float *arr = (float *)device->api().allocGlobMem(N * sizeof(float));
 
   std::vector<float> hostVector(N, 0);
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(float), device->api->getDefaultStream());
-  device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(float), device->api().getDefaultStream());
+  device->algorithms().touchMemory(arr, N, false, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(float), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchClean64) {
   const int N = 100;
-  double *arr = (double *)device->api->allocGlobMem(N * sizeof(double));
-  device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
+  double *arr = (double *)device->api().allocGlobMem(N * sizeof(double));
+  device->algorithms().touchMemory(arr, N, true, device->api().getDefaultStream());
 
   std::vector<double> hostVector(N, 1);
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, touchNoClean64) {
   const int N = 100;
-  double *arr = (double *)device->api->allocGlobMem(N * sizeof(double));
+  double *arr = (double *)device->api().allocGlobMem(N * sizeof(double));
 
   std::vector<double> hostVector(N, 0);
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
-  device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(double), device->api().getDefaultStream());
+  device->algorithms().touchMemory(arr, N, false, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(double), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(0, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 TEST_F(ArrayManip, scale) {
   const int N = 100;
   std::vector<int> hostVector(N, 1);
-  int *arr = (int *)device->api->allocGlobMem(N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(N * sizeof(int));
 
-  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(int), device->api->getDefaultStream());
-  device->algorithms.scaleArray(arr, 5, N, device->api->getDefaultStream());
-  device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api->getDefaultStream());
+  device->api().copyToAsync(arr, &hostVector[0], N * sizeof(int), device->api().getDefaultStream());
+  device->algorithms().scaleArray(arr, 5, N, device->api().getDefaultStream());
+  device->api().copyFromAsync(&hostVector[0], arr, N * sizeof(int), device->api().getDefaultStream());
 
-  device->api->syncDefaultStreamWithHost();
+  device->api().syncDefaultStreamWithHost();
 
   for (auto &i : hostVector) {
     EXPECT_EQ(5, i);
   }
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
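All of these tests follow the same copy-in, run-algorithm, copy-out, sync skeleton; the diff only rewrites `device->api->` and `device->algorithms.` into the accessor calls. For orientation, a sketch of what a new test in the same style would look like; `scaleByTwo` is an illustrative name, not part of this patch:

```cpp
#include <vector>
#include "BaseTestSuite.h"

// Same fixture and call pattern as the tests above, post-refactor.
TEST_F(ArrayManip, scaleByTwo) {
  const int N = 16;
  int *arr = static_cast<int *>(device->api().allocGlobMem(N * sizeof(int)));
  std::vector<int> host(N, 3);

  device->api().copyToAsync(arr, host.data(), N * sizeof(int), device->api().getDefaultStream());
  device->algorithms().scaleArray(arr, 2, N, device->api().getDefaultStream());
  device->api().copyFromAsync(host.data(), arr, N * sizeof(int), device->api().getDefaultStream());
  device->api().syncDefaultStreamWithHost();

  for (int v : host) {
    EXPECT_EQ(6, v);
  }
  device->api().freeGlobMem(arr);
}
```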
diff --git a/tests/batch_manip.cpp b/tests/batch_manip.cpp
index 380dad6..9e1db35 100644
--- a/tests/batch_manip.cpp
+++ b/tests/batch_manip.cpp
@@ -17,8 +17,8 @@ class BatchManip : public BaseTestSuite {
 public:
   template <typename T, typename F>
   void testWrapper(size_t N, size_t M, bool sparse, F&& inner) {
-    T *data = (T *)device->api->allocGlobMem(N * M * sizeof(T));
-    T **batch = (T **)device->api->allocUnifiedMem(N * sizeof(T*));
+    T *data = (T *)device->api().allocGlobMem(N * M * sizeof(T));
+    T **batch = (T **)device->api().allocUnifiedMem(N * sizeof(T*));
 
     for (size_t i = 0; i < N; ++i) {
       if (!(sparse && i % 2 == 0)) {
@@ -31,8 +31,8 @@ class BatchManip : public BaseTestSuite {
 
     std::forward<F>(inner)(batch, data);
 
-    device->api->freeGlobMem(data);
-    device->api->freeGlobMem(batch);
+    device->api().freeGlobMem(data);
+    device->api().freeGlobMem(batch);
   }
 };
@@ -43,12 +43,12 @@ TEST_F(BatchManip, fill32) {
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     float scalar = 502;
-    device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());
+    device->algorithms().setToValue(batch, scalar, M, N, device->api().getDefaultStream());
 
     std::vector<float> hostVector(N * M, 0);
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(scalar, i);
@@ -60,12 +60,12 @@ TEST_F(BatchManip, touchClean32) {
   const int N = 100;
   const int M = 120;
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
 
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(0, i);
@@ -75,11 +75,11 @@ TEST_F(BatchManip, touchClean32) {
 
   testWrapper<float>(N, M, true, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (size_t i = 0; i < N; ++i) {
       for (size_t j = 0; j < M; ++j) {
@@ -100,11 +100,11 @@ TEST_F(BatchManip, touchNoClean32) {
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, false, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
@@ -116,42 +116,42 @@ TEST_F(BatchManip, scatterToUniform32) {
   const int N = 10000;
   const int M = 12000;
 
-  float *data2 = (float *)device->api->allocGlobMem(N * M * sizeof(float));
+  float *data2 = (float *)device->api().allocGlobMem(N * M * sizeof(float));
 
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.copyScatterToUniform(const_cast<const float **>(batch), data2, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data2, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().copyScatterToUniform(const_cast<const float **>(batch), data2, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data2, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
 
 TEST_F(BatchManip, uniformToScatter32) {
   const int N = 10000;
   const int M = 12000;
 
-  float *data2 = (float *)device->api->allocGlobMem(N * M * sizeof(float));
+  float *data2 = (float *)device->api().allocGlobMem(N * M * sizeof(float));
 
   testWrapper<float>(N, M, false, [&](float** batch, float* data) {
     std::vector<float> hostVector(N * M, 1);
-    device->api->copyToAsync(data2, &hostVector[0], N * M * sizeof(float), device->api->getDefaultStream());
-    device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api->getDefaultStream());
+    device->api().copyToAsync(data2, &hostVector[0], N * M * sizeof(float), device->api().getDefaultStream());
+    device->algorithms().copyUniformToScatter(data2, batch, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(float), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
      EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
@@ -164,12 +164,12 @@ TEST_F(BatchManip, fill64) {
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     double scalar = 502;
-    device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());
+    device->algorithms().setToValue(batch, scalar, M, N, device->api().getDefaultStream());
 
     std::vector<double> hostVector(N * M, 0);
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(scalar, i);
@@ -181,12 +181,12 @@ TEST_F(BatchManip, touchClean64) {
   const int N = 100;
   const int M = 120;
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
 
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(0, i);
@@ -196,11 +196,11 @@ TEST_F(BatchManip, touchClean64) {
 
   testWrapper<double>(N, M, true, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, true, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, M * N * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (size_t i = 0; i < N; ++i) {
       for (size_t j = 0; j < M; ++j) {
@@ -221,11 +221,11 @@ TEST_F(BatchManip, touchNoClean64) {
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().touchBatchedMemory(batch, M, N, false, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
@@ -237,42 +237,42 @@ TEST_F(BatchManip, scatterToUniform64) {
   const int N = 100;
   const int M = 120;
 
-  double *data2 = (double *)device->api->allocGlobMem(N * M * sizeof(double));
+  double *data2 = (double *)device->api().allocGlobMem(N * M * sizeof(double));
 
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.copyScatterToUniform(const_cast<const double **>(batch), data2, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data2, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().copyScatterToUniform(const_cast<const double **>(batch), data2, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data2, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
 
 TEST_F(BatchManip, uniformToScatter64) {
   const int N = 100;
   const int M = 120;
 
-  double *data2 = (double *)device->api->allocGlobMem(N * M * sizeof(double));
+  double *data2 = (double *)device->api().allocGlobMem(N * M * sizeof(double));
 
   testWrapper<double>(N, M, false, [&](double** batch, double* data) {
     std::vector<double> hostVector(N * M, 1);
-    device->api->copyToAsync(data2, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
-    device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
-    device->api->copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+    device->api().copyToAsync(data2, &hostVector[0], N * M * sizeof(double), device->api().getDefaultStream());
+    device->algorithms().copyUniformToScatter(data2, batch, M, M, N, device->api().getDefaultStream());
+    device->api().copyFromAsync(&hostVector[0], data, N * M * sizeof(double), device->api().getDefaultStream());
 
-    device->api->syncDefaultStreamWithHost();
+    device->api().syncDefaultStreamWithHost();
 
     for (auto &i : hostVector) {
       EXPECT_EQ(1, i);
     }
   });
 
-  device->api->freeGlobMem(data2);
+  device->api().freeGlobMem(data2);
 }
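For reference, `testWrapper` builds the batched layout these tests depend on: one contiguous allocation of `N * M` values plus a unified-memory array of `N` per-element pointers, where the sparse variants leave every other pointer unset. The same layout sketched on the host:

```cpp
#include <cstddef>
#include <vector>

int main() {
  const std::size_t N = 4; // number of batch elements
  const std::size_t M = 3; // values per element
  std::vector<float> data(N * M);
  std::vector<float *> batch(N, nullptr);
  for (std::size_t i = 0; i < N; ++i) {
    batch[i] = &data[i * M]; // the sparse variants skip every other entry
  }
  return 0;
}
```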
diff --git a/tests/main.cpp b/tests/main.cpp
index 9f514c5..2863432 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -10,9 +10,9 @@ using namespace device;
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  DeviceInstance &device = DeviceInstance::getInstance();
-  device.api->setDevice(0);
-  device.api->initialize();
+  DeviceInstance &device = DeviceInstance::instance();
+  device.api().setDevice(0);
+  device.api().initialize();
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/memory.cpp b/tests/memory.cpp
index f87625f..728ce18 100644
--- a/tests/memory.cpp
+++ b/tests/memory.cpp
@@ -28,16 +28,16 @@ TEST_F(Memories, copy2DMemory) {
     }
   }
 
-  int *arr = (int *)device->api->allocGlobMem(M * N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * N * sizeof(int));
 
   int spitch = N * sizeof(int);
   int dpitch = N * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N];
-  device->api->copy2dArrayFrom(&hostVector2[0], dpitch, arr, spitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], dpitch, arr, spitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N; j++) {
@@ -45,7 +45,7 @@ TEST_F(Memories, copy2DMemory) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 
 /*
@@ -64,16 +64,16 @@ TEST_F(Memories, copy2DMemoryWithSrcPitch) {
     }
  }
 
-  int *arr = (int *)device->api->allocGlobMem(M * N * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * N * sizeof(int));
 
   int spitch = (N + SPI) * sizeof(int);
   int dpitch = N * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N];
-  device->api->copy2dArrayFrom(&hostVector2[0], dpitch, arr, dpitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], dpitch, arr, dpitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N; j++) {
@@ -81,7 +81,7 @@ TEST_F(Memories, copy2DMemoryWithSrcPitch) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 */
@@ -103,16 +103,16 @@ TEST_F(Memories, copy2DMemoryWithDstPitch) {
     }
   }
 
-  int *arr = (int *)device->api->allocGlobMem(M * (N + DPI) * sizeof(int));
+  int *arr = (int *)device->api().allocGlobMem(M * (N + DPI) * sizeof(int));
 
   int spitch = (N + SPI) * sizeof(int);
   int dpitch = (N + DPI) * sizeof(int);
   int width = N * sizeof(int);
   int height = M;
 
-  device->api->copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
+  device->api().copy2dArrayTo(arr, dpitch, &hostVector[0], spitch, width, height);
 
   int hostVector2[M][N + SPI];
-  device->api->copy2dArrayFrom(&hostVector2[0], spitch, arr, dpitch, width, height);
+  device->api().copy2dArrayFrom(&hostVector2[0], spitch, arr, dpitch, width, height);
 
   for (size_t i = 0; i < M; i++) {
     for (size_t j = 0; j < N + SPI; j++) {
@@ -120,7 +120,7 @@ TEST_F(Memories, copy2DMemoryWithDstPitch) {
     }
   }
 
-  device->api->freeGlobMem(arr);
+  device->api().freeGlobMem(arr);
 }
 */
diff --git a/tests/reductions.cpp b/tests/reductions.cpp
index 2af4756..63e40cf 100644
--- a/tests/reductions.cpp
+++ b/tests/reductions.cpp
@@ -28,45 +28,45 @@ TEST_F(Reductions, Add) {
     element = distribution(randomEngine);
   }
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), 0, std::plus<unsigned>());
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Add, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Add, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
 
 TEST_F(Reductions, Max) {
   constexpr size_t size = 20010000;
   std::vector<unsigned> vector(size, 0);
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   std::uniform_int_distribution<> distribution(10, 100);
   for (auto &element : vector) {
     element = distribution(randomEngine);
   }
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto max = [](unsigned a, unsigned b) -> unsigned { return a > b ? a : b; };
   auto initValue = std::numeric_limits<unsigned>::min();
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), initValue, max);
 
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Max, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Max, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
 
 TEST_F(Reductions, Min) {
@@ -78,19 +78,19 @@ TEST_F(Reductions, Min) {
     element = distribution(randomEngine);
   }
 
-  auto* devVector = reinterpret_cast<unsigned *>(device->api->allocGlobMem(sizeof(unsigned) * size));
-  device->api->copyTo(devVector, vector.data(), sizeof(unsigned) * size);
+  auto* devVector = reinterpret_cast<unsigned *>(device->api().allocGlobMem(sizeof(unsigned) * size));
+  device->api().copyTo(devVector, vector.data(), sizeof(unsigned) * size);
 
   auto min = [](unsigned a, unsigned b) -> unsigned { return a > b ? b : a; };
   auto initValue = std::numeric_limits<unsigned>::max();
   auto expectedResult = std::accumulate(vector.begin(), vector.end(), initValue, min);
 
-  unsigned* testResult = reinterpret_cast<unsigned *>(device->api->allocPinnedMem(sizeof(unsigned)));
+  unsigned* testResult = reinterpret_cast<unsigned *>(device->api().allocPinnedMem(sizeof(unsigned)));
 
-  device->algorithms.reduceVector(testResult, devVector, true, size, ReductionType::Min, device->api->getDefaultStream());
-  device->api->syncDefaultStreamWithHost();
+  device->algorithms().reduceVector(testResult, devVector, true, size, ReductionType::Min, device->api().getDefaultStream());
+  device->api().syncDefaultStreamWithHost();
 
   EXPECT_EQ(expectedResult, *testResult);
-  device->api->freePinnedMem(testResult);
-  device->api->freeGlobMem(devVector);
+  device->api().freePinnedMem(testResult);
+  device->api().freeGlobMem(devVector);
 }
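The expected values in these reduction tests come from `std::accumulate` on the host. A compact host-side reference for the `reduceVector` contract they exercise, covering only the `overrideResult == true` case the tests use; the enum mirrors `ReductionType` from this repository:

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

enum class ReductionType { Add, Max, Min };

// Host reference: what *result should hold after reduceVector(..., true, ...).
template <typename AccT, typename VecT>
AccT reduceVectorReference(const std::vector<VecT> &buffer, ReductionType type) {
  switch (type) {
  case ReductionType::Add:
    return std::accumulate(buffer.begin(), buffer.end(), AccT{0});
  case ReductionType::Max:
    return static_cast<AccT>(*std::max_element(buffer.begin(), buffer.end()));
  case ReductionType::Min:
    return static_cast<AccT>(*std::min_element(buffer.begin(), buffer.end()));
  }
  return AccT{0};
}
```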