diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h
index 589ea36f83..f151bc0d54 100644
--- a/Grid/allocator/Allocator.h
+++ b/Grid/allocator/Allocator.h
@@ -2,3 +2,4 @@
 #include <Grid/allocator/MemoryStats.h>
 #include <Grid/allocator/MemoryManager.h>
 #include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/allocator/DeviceMemoryAllocator.h>
diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
new file mode 100644
index 0000000000..c6711deba2
--- /dev/null
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -0,0 +1,235 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/allocator/DeviceMemoryAllocator.cc
+
+    Copyright (C) 2025
+
+Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024) // allocation granularity in bytes; every request is rounded up to whole pages
+#define OVERALLOCATION_FACTOR 1.2 // default multiplier applied to the requested region size in Init()
+
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
+struct DeviceMemoryAllocator {
+  // Page-granular sub-allocator: serves blocks out of one device region that
+  // Init() obtains from acceleratorAllocDeviceInternal().  Freed blocks are
+  // kept in per-size lists and re-used whole; they are never split or merged.
+  bool initialized;
+  char* base;    // start of the device region
+  size_t size;   // region size in bytes (a whole number of pages)
+  size_t offset; // index of the first page that has never been handed out
+  bool verbose;  // set in Init() when GRID_DEBUG_DEVICE_ALLOCATOR is present
+
+  DeviceMemoryAllocator() : initialized(false), base(0), size(0), offset(0), verbose(false) {}
+
+  // The region comes from acceleratorAllocDeviceInternal(), so release it with
+  // the matching Internal call.  acceleratorFreeDevice() in this file is the
+  // wrapper around dma.free(), which only recycles pages and frees nothing.
+  ~DeviceMemoryAllocator() {
+    if (initialized) {
+      acceleratorFreeDeviceInternal(base);
+      initialized = false;
+    }
+  }
+
+  // pages[i]: length in pages of the live block starting at page i, else 0
+  std::vector<size_t> pages;
+  // block length in pages -> start pages of the free blocks of that length
+  std::map<size_t, std::vector<size_t> > size_map;
+
+  // Allocates the region (_size scaled by the overallocation factor, rounded
+  // up to whole pages) and fills it with an all-ones bit pattern.
+  void Init(size_t _size) {
+    assert(!initialized);
+
+    // GRID_OVERALLOCATION_FACTOR overrides the compiled-in default
+    double factor = OVERALLOCATION_FACTOR;
+    char* str;
+    if ((str = getenv("GRID_OVERALLOCATION_FACTOR")))
+      factor = atof(str);
+    assert(factor > 0); // atof() yields 0 for text that is not a number
+    _size = (size_t)(_size * factor);
+
+    verbose = (getenv("GRID_DEBUG_DEVICE_ALLOCATOR") != 0);
+
+    size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    std::cout << GridLogMessage << "Init device allocator with " << size << " bytes" << std::endl;
+
+    base = (char*)acceleratorAllocDeviceInternal(size);
+    assert(base);
+
+    if (verbose)
+      std::cout << GridLogMessage << "Initialize memory to all-ones bit pattern" << std::endl;
+    {
+      // one kernel launch per chunk of at most MAX_BLOCK_INIT 64-bit words
+      uint64_t* ba = (uint64_t*)base;
+      size_t n = size / sizeof(uint64_t);
+      size_t MAX_BLOCK_INIT = 128*1024*1024;
+      while (n > 0) {
+        size_t n0 = n;
+        if (n0 > MAX_BLOCK_INIT)
+          n0 = MAX_BLOCK_INIT;
+        accelerator_for(i, n0, 1, {
+            ba[i] = (uint64_t)-1;
+          });
+        ba += n0;
+        n -= n0;
+      }
+    }
+
+    if (verbose)
+      std::cout << GridLogMessage << "Done" << std::endl;
+
+    offset = 0;
+    pages.resize(n_pages, 0);
+    if (verbose)
+      std::cout << GridLogMessage << "Pages initialized" << std::endl;
+
+    initialized = true;
+  }
+
+  // Takes a free block of exactly n_pages pages, marks it live and returns its
+  // address; returns 0 when no block of that length is free.
+  void* attemptReuseExactSize(size_t n_pages) {
+    auto sm = size_map.find(n_pages);
+    if (sm != size_map.end() && sm->second.size() > 0) {
+      size_t index = sm->second.back();
+      sm->second.pop_back();
+      // drop exhausted lists so that every key in size_map has a free block
+      if (sm->second.size() == 0)
+        size_map.erase(sm);
+
+      assert(pages[index] == 0);
+      pages[index] = n_pages;
+      return base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    }
+    return 0;
+  }
+
+  // Carves n_pages pages out of the never-used tail of the region; returns 0
+  // when the tail is too short.
+  void* attemptAllocUnused(size_t n_pages) {
+    size_t end = (offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    void* ptr = 0;
+
+    if (end <= size) {
+      pages[offset] = n_pages;
+      ptr = base + offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+      offset += n_pages;
+
+      if (verbose) {
+        size_t reusable_pages = 0;
+        for (auto & sm : size_map)
+          reusable_pages += sm.first * sm.second.size();
+        std::cout << GridLogMessage << (size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate ("
+                  << (size - end) * 100 / size << "% unallocated, " << reusable_pages << " reusable pages)" << std::endl;
+      }
+    }
+
+    return ptr;
+  }
+
+  // Returns a block of at least bytes bytes (a zero-byte request still gets a
+  // page).  Tries a free block of the exact page count, then the unused tail,
+  // then the smallest larger free block; exits the process if all three fail.
+  void* alloc(size_t bytes) {
+    if (!initialized)
+      Init(MemoryManager::DeviceMaxBytes);
+
+    if (!bytes)
+      bytes++;
+
+    size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+
+    // first check if block of perfect size is available
+    void* ptr;
+    if ((ptr = attemptReuseExactSize(n_pages))) {
+      if (verbose)
+        std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl;
+      return ptr;
+    }
+
+    // if not, attempt to allocate in the unused area
+    if ((ptr = attemptAllocUnused(n_pages)))
+      return ptr;
+
+    // last attempt, find a re-usable region that barely fits and return it
+    // for loop of std::map iterates in ascending order
+    size_t reusable_pages = 0;
+    size_t n_pages_usable = 0;
+    for (auto & sm : size_map) {
+      assert(sm.second.size() > 0); // should never be empty
+      reusable_pages += sm.first * sm.second.size();
+      if (n_pages_usable == 0 && sm.first > n_pages)
+        n_pages_usable = sm.first;
+    }
+
+    if (n_pages_usable == 0) {
+      std::cout << GridLogMessage << "Out of memory for " << n_pages << " pages!  Re-usable pages at time of death:" << std::endl;
+      for (auto & sm : size_map)
+        std::cout << GridLogMessage << sm.second.size() << " x " << sm.first << " pages" << std::endl;
+      exit(1);
+    }
+
+    if ((ptr = attemptReuseExactSize(n_pages_usable))) {
+      if (verbose)
+        std::cout << GridLogMessage << "Can re-use pointer for " << n_pages_usable << " pages when " << n_pages << " were needed; " << reusable_pages << " reusable pages" << std::endl;
+      return ptr;
+    }
+
+    // this should never be reached
+    assert(0);
+    return ptr;
+  }
+
+  // Puts the block starting at ptr on the free list for its length.  Null is
+  // ignored; foreign pointers and double frees trip an assert.
+  void free(void* ptr) {
+    if (!initialized || !ptr)
+      return;
+    assert((char*)ptr >= base && (char*)ptr < base + size);
+    size_t index = ((size_t)((char*)ptr - base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    size_t n_pages = pages[index];
+    assert(n_pages != 0); // 0 means no live block starts at this page
+    pages[index] = 0;
+    size_map[n_pages].push_back(index);
+  }
+};
+
+static DeviceMemoryAllocator dma; // the one file-local allocator instance behind acceleratorAllocDevice/acceleratorFreeDevice
+
+void *acceleratorAllocDevice(size_t bytes) { // forwards to dma.alloc(), which runs dma.Init() on the first call
+  return dma.alloc(bytes);
+}
+
+void acceleratorFreeDevice(void *ptr) { // forwards to dma.free(): the block becomes re-usable, no device memory is released
+  dma.free(ptr);
+}
+#endif
+
+NAMESPACE_END(Grid);
diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h
new file mode 100644
index 0000000000..13c7eb8e18
--- /dev/null
+++ b/Grid/allocator/DeviceMemoryAllocator.h
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/allocator/DeviceMemoryAllocator.h
+
+    Copyright (C) 2025
+
+Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+NAMESPACE_BEGIN(Grid);
+// Declared only when Grid is configured with GRID_DEVICE_MEMORY_ALLOCATOR; both are defined in DeviceMemoryAllocator.cc.
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
+void *acceleratorAllocDevice(size_t bytes); // returns a block of at least bytes bytes from the file-local DeviceMemoryAllocator
+void acceleratorFreeDevice(void *ptr); // hands a block from acceleratorAllocDevice back to that allocator for re-use
+#endif
+
+NAMESPACE_END(Grid);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 38e8072d4e..bb168a66a4 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -44,6 +44,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
 #endif
 
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR // while these macros are active, this header's device alloc/free functions are compiled under the *Internal names
+#define acceleratorAllocDevice acceleratorAllocDeviceInternal
+#define acceleratorFreeDevice acceleratorFreeDeviceInternal
+#endif // both macros are removed again by the #undef block at the end of this header
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////////////////////////////////
@@ -346,7 +351,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 		     });						\
   });
 
-#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait_and_throw(); theGridAccelerator->wait_and_throw(); } // NOTE(review): wait_and_throw() is issued twice back to back; no reason is given here - confirm it is intentional
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
@@ -355,7 +360,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
-inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait_and_throw(); theCopyAccelerator->wait_and_throw(); } // NOTE(review): waits on the copy queue twice in a row - confirm it is intentional
 
 
 ///////
@@ -365,7 +370,7 @@ typedef sycl::event acceleratorEvent_t;
 
 inline void acceleratorEventWait(acceleratorEvent_t ev)
 {
-  ev.wait();
+  ev.wait_and_throw(); // SYCL event::wait_and_throw(): waits for the event, then reports unconsumed asynchronous errors to the async handler
 }
 
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
@@ -377,9 +382,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
 
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} // blocking copy; NOTE(review): double wait_and_throw() - confirm intentional
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} // blocking copy; NOTE(review): double wait_and_throw() - confirm intentional
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} // blocking memset; NOTE(review): double wait_and_throw() - confirm intentional
 
 inline int  acceleratorIsCommunicable(void *ptr)
 {
@@ -650,7 +655,7 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 //////////////////////////////////////////////
 
 #ifdef GRID_SYCL
-inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
+inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); theGridAccelerator->ext_oneapi_submit_barrier(); }; // NOTE(review): the barrier is submitted twice in a row - confirm it is intentional
 #else
 // Ordering within a stream guaranteed on Nvidia & AMD
 inline void acceleratorFenceComputeStream(void){ };
@@ -720,3 +725,8 @@ template<class T> T acceleratorGet(T& dev)
 
 
 NAMESPACE_END(Grid);
+
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR // end of the rename started near the top of this header
+#undef acceleratorAllocDevice
+#undef acceleratorFreeDevice
+#endif // from here on the plain names are free again (DeviceMemoryAllocator.h declares functions under them)
diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc
new file mode 100644
index 0000000000..92dd40bc1b
--- /dev/null
+++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc
@@ -0,0 +1,431 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf_fp32_jureap.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+////////////////////////
+/// Move to domains ////
+////////////////////////
+
+Gamma::Algebra Gmu [] = { // gamma matrix per direction X, Y, Z, T; indexed by mu when Benchmark() builds its references
+			 Gamma::Algebra::GammaX,
+			 Gamma::Algebra::GammaY,
+			 Gamma::Algebra::GammaZ,
+			 Gamma::Algebra::GammaT
+};
+
+void Benchmark(int Ls, Coordinate Dirichlet);
+
+#include <chrono>
+#include <thread>
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // Fifth-dimension extent; "-Ls <n>" on the command line overrides the default.
+  // The scan stops at argc-1 so a trailing "-Ls" without a value is never read,
+  // and the default is kept when the value does not parse as an integer.
+  int Ls=16;
+  for(int i=0;i+1<argc;i++) {
+    if(std::string(argv[i]) == "-Ls"){
+      int parsed;
+      std::stringstream ss(argv[i+1]);
+      if (ss >> parsed) Ls = parsed;
+    }
+  }
+
+  //////////////////
+  // With comms
+  //////////////////
+  Coordinate Dirichlet(Nd+1,0);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  
+  Benchmark(Ls,Dirichlet);
+
+  //////////////////
+  // Domain decomposed
+  //////////////////
+  Coordinate latt4  = GridDefaultLatt();
+  Coordinate mpi    = GridDefaultMpi();
+  Coordinate CommDim(Nd);
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+
+  Grid_finalize();
+  exit(0);
+}
+void Benchmark(int Ls, Coordinate Dirichlet) // times Dw.Dhop and Dw.DhopEO for one Ls/Dirichlet setting and checks the results against Cshift-built references
+{
+  Coordinate latt4 = GridDefaultLatt();
+  GridLogLayout();
+  // flop count per 5d site and call; used below to turn the timings into mflop/s
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+  std::vector<int> seeds4({1,2,3,4}); // NOTE(review): seeds4/seeds5 are not referenced again in this function
+  std::vector<int> seeds5({5,6,7,8});
+#define SINGLE // selects the single-precision typedef set below
+#ifdef SINGLE
+  typedef vComplexF          Simd;
+  typedef LatticeFermionF    FermionField;
+  typedef LatticeGaugeFieldF GaugeField;
+  typedef LatticeColourMatrixF ColourMatrixField;
+  typedef DomainWallFermionF FermionAction;
+#endif
+#ifdef DOUBLE
+  typedef vComplexD          Simd;
+  typedef LatticeFermionD    FermionField;
+  typedef LatticeGaugeFieldD GaugeField;
+  typedef LatticeColourMatrixD ColourMatrixField;
+  typedef DomainWallFermionD FermionAction;
+#endif
+#ifdef DOUBLE2
+  typedef vComplexD2          Simd;
+  typedef LatticeFermionD2    FermionField;
+  typedef LatticeGaugeFieldD2 GaugeField;
+  typedef LatticeColourMatrixD2 ColourMatrixField;
+  typedef DomainWallFermionD2 FermionAction;
+#endif
+  
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+
+ 
+  FermionField src   (FGrid); random(RNG5,src);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+#endif
+
+  FermionField result(FGrid); result=Zero();
+  FermionField    ref(FGrid);    ref=Zero();
+  FermionField    tmp(FGrid);
+  FermionField    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  GaugeField Umu(UGrid);
+  GaugeField UmuCopy(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(Umu);
+  UmuCopy=Umu;
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+  ////////////////////////////////////
+  // Apply BCs
+  ////////////////////////////////////
+  Coordinate Block(4);
+  for(int d=0;d<4;d++)  Block[d]= Dirichlet[d+1];
+
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
+
+  DirichletFilter<GaugeField> Filter(Block);
+  Filter.applyFilter(Umu);
+  
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<ColourMatrixField> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  }
+
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+	  }
+	}
+      }
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  FermionAction::ImplParams p;
+  p.dirichlet=Dirichlet;
+  FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.ImportGauge(Umu);
+  
+  int ncall =300; // number of timed calls per kernel
+  RealD n2e;
+  
+  if (1) {
+    FGrid->Barrier();
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    // cpu_start/cpu_stop are system_clock stamps that rank 0 writes to the file energy.times below
+    FGrid->Barrier();
+    std::this_thread::sleep_for(std::chrono::seconds(5)); // NOTE(review): 5 s idle before the timed loop, presumably to isolate it in an external energy trace - confirm
+    auto cpu_start = std::chrono::system_clock::now();
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+    auto cpu_stop = std::chrono::system_clock::now();
+    std::this_thread::sleep_for(std::chrono::seconds(5)); // matching 5 s idle after the timed loop
+
+    if (FGrid->ThisRank() == 0) {
+      std::ofstream  file("energy.times");
+      file<<"energy_start:"<<  std::chrono::duration_cast<std::chrono::milliseconds>(cpu_start.time_since_epoch()).count()<<std::endl;
+      file<<"energy_stop:"<<  std::chrono::duration_cast<std::chrono::milliseconds>(cpu_stop.time_since_epoch()).count()<<std::endl;
+    }
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=single_site_flops*volume*ncall;
+
+    auto nsimd = Simd::Nsimd();
+    auto simdwidth = sizeof(Simd);
+    // NOTE(review): data_rf and data_mem below are computed but never printed or otherwise used
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    err = ref-result;
+    n2e = norm2(err);
+    std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+
+    if(( n2e>1.0e-4) ) {
+      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      std::cout<<GridLogMessage << "RESULT" << std::endl;
+      //      std::cout << result<<std::endl;
+      std::cout << norm2(result)<<std::endl;
+      std::cout<<GridLogMessage << "REF" << std::endl;
+      std::cout << norm2(ref)<<std::endl;
+      std::cout<<GridLogMessage << "ERR" << std::endl;
+      std::cout << norm2(err)<<std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert (n2e< 1.0e-4 );
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    int i=s+Ls*ss;
+	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+	  }
+	}
+      }
+      
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      //      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+    ref = -0.5*ref;
+  }
+
+  Dw.Dhop(src,result,DaggerYes);
+
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
+
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  err = ref-result;
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+
+  assert((n2e)<1.0e-4);
+  // checkerboarded fields on FrbGrid for the DhopEO/DhopOE cross-check below
+  FermionField src_e (FrbGrid);
+  FermionField src_o (FrbGrid);
+  FermionField r_e   (FrbGrid);
+  FermionField r_o   (FrbGrid);
+  FermionField r_eo  (FGrid);
+
+  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    FGrid->Barrier();
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(single_site_flops*volume*ncall)/2.0;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
+  }
+  Dw.DhopEO(src_o,r_e,DaggerNo);
+  Dw.DhopOE(src_e,r_o,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
+  // put both parity results into r_eo and compare with the full Dhop result
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err = r_eo-result;
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
+  assert(n2e<1.0e-4);
+  // src_e/src_o are re-used here to hold the even and odd parts of err
+  pickCheckerboard(Even,src_e,err);
+  pickCheckerboard(Odd,src_o,err);
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
+}
diff --git a/configure.ac b/configure.ac
index ab01878709..ce786b0169 100644
--- a/configure.ac
+++ b/configure.ac
@@ -275,7 +275,7 @@ esac
 ############### CHECKSUM COMMS
 AC_ARG_ENABLE([checksum-comms],
     [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
-    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=no])
 
 case ${ac_CHECKSUM_COMMS} in
     yes)
@@ -283,10 +283,21 @@ case ${ac_CHECKSUM_COMMS} in
     *);;
 esac
 
+############### DEVICE MEMORY ALLOCATOR
+AC_ARG_ENABLE([device-memory-allocator],
+    [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])],
+    [ac_DEVICE_MEMORY_ALLOCATOR=${enable_device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no])
+# Off unless requested; only the value yes defines GRID_DEVICE_MEMORY_ALLOCATOR.
+case ${ac_DEVICE_MEMORY_ALLOCATOR} in
+    yes)
+      AC_DEFINE([GRID_DEVICE_MEMORY_ALLOCATOR],[1],[device memory allocator]);;
+    *);;
+esac
+
 ############### LOG VIEWS
 AC_ARG_ENABLE([log-views],
     [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
-    [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
+    [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=no])
 
 case ${ac_LOG_VIEWS} in
     yes)