From f84410d8e06eb457754c3022ca5e6cfc3131cf3b Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 20 May 2023 11:22:58 +0200
Subject: [PATCH 01/53] reduce zmobius verbosity

---
 Grid/qcd/action/fermion/ZMobiusFermion.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h
index fc8a74395d..365714f417 100644
--- a/Grid/qcd/action/fermion/ZMobiusFermion.h
+++ b/Grid/qcd/action/fermion/ZMobiusFermion.h
@@ -57,7 +57,7 @@ class ZMobiusFermion : public CayleyFermion5D<Impl>
 
   {
     //    RealD eps = 1.0;
-    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+    //std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
     Vector<Coeff_t> zgamma(this->Ls);
     for(int s=0;s<this->Ls;s++){
       zgamma[s] = gamma[s];

From d20caace9834daa5bdc0bd52299c717fdc797858 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 25 Sep 2024 12:40:38 +0200
Subject: [PATCH 02/53] Low-level half-precision comms framework and bfloat16
 implementation

---
 Grid/communicator/Communicator_base.cc |  2 +
 Grid/communicator/Communicator_base.h  | 10 ++-
 Grid/communicator/Communicator_mpi3.cc | 98 +++++++++++++++++++++++---
 Grid/communicator/Communicator_none.cc |  6 +-
 Grid/communicator/SharedMemory.h       |  2 +-
 Grid/stencil/Stencil.h                 |  6 +-
 Grid/util/Init.cc                      |  7 +-
 benchmarks/Benchmark_comms.cc          | 16 ++---
 8 files changed, 121 insertions(+), 26 deletions(-)

diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc
index 79efb90cba..e2118eba35 100644
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@@ -41,6 +41,8 @@ bool Stencil_force_mpi = true;
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
+CartesianCommunicator::StencilCompressionPolicy_t  
+CartesianCommunicator::StencilCompressionPolicy= CartesianCommunicator::StencilCompressionPolicyNone;
 
 /////////////////////////////////
 // Grid information queries
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index b98424a19c..ffe960e950 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -49,6 +49,10 @@ class CartesianCommunicator : public SharedMemory {
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
   static int       nCommThreads;
 
+  enum StencilCompressionPolicy_t { StencilCompressionPolicyNone, StencilCompressionPolicyBfloat16 };
+  static StencilCompressionPolicy_t StencilCompressionPolicy;
+  static void SetStencilCompressionPolicy(StencilCompressionPolicy_t policy ) { StencilCompressionPolicy = policy; }
+
   ////////////////////////////////////////////
   // Communicator should know nothing of the physics grid, only processor grid.
   ////////////////////////////////////////////
@@ -148,17 +152,17 @@ class CartesianCommunicator : public SharedMemory {
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
 			       int recv_from_rank,int do_recv,
-			       int bytes,int dir);
+			       int bytes,int dir,size_t word_size);
 
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
 				    void *recv,
 				    int recv_from_rank,int do_recv,
-				    int xbytes,int rbytes,int dir);
+				    int xbytes,int rbytes,int dir,size_t word_size);
   
   
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i,size_t word_size);
   void StencilBarrier(void);
 
   ////////////////////////////////////////////////////////////
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index e7d7a96def..cbe5c5bb20 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -340,11 +340,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest, int dox,
 						     void *recv,
 						     int from, int dor,
-						     int bytes,int dir)
+						     int bytes,int dir,size_t word_size)
 {
   std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  StencilSendToRecvFromComplete(list,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir,word_size);
+  StencilSendToRecvFromComplete(list,dir,word_size);
   return offbytes;
 }
 
@@ -353,7 +353,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int dest,int dox,
 							 void *recv,
 							 int from,int dor,
-							 int xbytes,int rbytes,int dir)
+							 int xbytes,int rbytes,int dir,size_t word_size)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -372,12 +372,45 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   double off_node_bytes=0.0;
   int tag;
 
+  if (word_size == sizeof(float) &&
+      CartesianCommunicator::StencilCompressionPolicy
+      == CartesianCommunicator::StencilCompressionPolicyBfloat16) {
+    
+    assert(rbytes % 2 == 0 && xbytes % 2 == 0);
+    rbytes /= 2;
+    xbytes /= 2;
+
+    if (dox) {
+      assert(xbytes % sizeof(uint16_t) == 0);
+      // in-place compression
+      uint16_t* words = (uint16_t*)xmit;
+      size_t nwords = xbytes / sizeof(uint16_t);
+      assert(nwords % 2 == 0); // currently only works for even number of floats
+      size_t nwords_half = nwords / 2;
+
+      double t0 = usecond() / 1e6;
+
+      size_t width = 256;
+      assert(nwords_half % width == 0);
+      accelerator_for(i, nwords_half / width, width, {
+	  int lane = acceleratorSIMTlane(width);
+	  size_t j = i*width + lane;
+	  words[2*j + 0] = words[2*nwords_half + 2*j + 1];
+	});
+      double t1 = usecond() / 1e6;
+      double GBps = (2.*xbytes/1e9)/(t1-t0);
+      std::cout << GridLogMessage << "Compress at " << GBps << " GB/s for nwords = " << nwords << std::endl;
+
+      // bfloat[0]  bfloat[nwords_half]  bfloat[1]  bfloat[nwords_half + 1]  ...
+    }
+  }
+
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
       ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
       assert(ierr==0);
-      list.push_back(rrq);
+      list.push_back({rrq, recv, rbytes});
       off_node_bytes+=rbytes;
     }
   }
@@ -387,7 +420,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       tag= dir+_processor*32;
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       assert(ierr==0);
-      list.push_back(xrq);
+      list.push_back({xrq,0,0});
       off_node_bytes+=xbytes;
     } else {
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
@@ -398,15 +431,64 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir,size_t word_size)
 {
   int nreq=list.size();
 
   if (nreq==0) return;
 
   std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  std::vector<MPI_Request> requests(nreq);
+  for (int i=0;i<nreq;i++)
+    requests[i] = list[i].request;
+  int ierr = MPI_Waitall(nreq,&requests[0],&status[0]);
   assert(ierr==0);
+
+  if (word_size == sizeof(float) &&
+      CartesianCommunicator::StencilCompressionPolicy
+      == CartesianCommunicator::StencilCompressionPolicyBfloat16) {
+
+    double t0 = usecond() / 1e6;
+    double Gb = 0.0;
+    
+    for (auto & l : list) {
+
+      if (l.recv) {
+
+	int rbytes = l.rbytes;
+
+	assert(rbytes % sizeof(uint16_t) == 0);
+	// in-place de-compression
+	uint16_t* words = (uint16_t*)l.recv;
+	size_t nwords = rbytes / sizeof(uint16_t);
+	assert(nwords % 2 == 0);
+	size_t nwords_half = nwords / 2;
+
+	size_t width = 256;
+	assert(nwords_half % width == 0);
+	accelerator_forNB(i, nwords_half / width, width, {
+	    int lane = acceleratorSIMTlane(width);
+	    size_t j = i*width + lane;
+	    words[2*nwords_half + 2*j + 1] = words[2*j + 0];
+	  });
+	accelerator_forNB(i, nwords_half / width, width, {
+	    int lane = acceleratorSIMTlane(width);
+	    size_t j = i*width + lane;
+	    words[2*j + 0] = 0;
+	});
+
+	Gb += 2.*rbytes/1e9;
+      }
+    }
+    
+    accelerator_barrier();
+    
+    double t1 = usecond() / 1e6;
+    double GBps = Gb/(t1-t0);
+    std::cout << GridLogMessage << "Decompress at " << GBps << " GB/s for GB =" << Gb << std::endl;
+
+  }
+
   list.resize(0);
 }
 void CartesianCommunicator::StencilBarrier(void)
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index a06e054da0..9c67002edd 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -117,7 +117,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,
 						     int recv_from_rank,int dor,
-						     int bytes, int dir)
+						     int bytes, int dir,size_t word_size)
 {
   return 2.0*bytes;
 }
@@ -126,11 +126,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int xmit_to_rank,int dox,
 							 void *recv,
 							 int recv_from_rank,int dor,
-							 int xbytes,int rbytes, int dir)
+							 int xbytes,int rbytes, int dir,size_t word_size)
 {
   return xbytes+rbytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir,size_t word_size)
 {
 }
 
diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h
index d55fbf3ddf..db84247a83 100644
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -46,7 +46,7 @@ NAMESPACE_BEGIN(Grid);
 
 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
-typedef MPI_Request CommsRequest_t;
+struct CommsRequest_t { MPI_Request request; void* recv; int rbytes; };
 #else 
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index ef3aa821a5..3a570e13f2 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -441,13 +441,15 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
+					Packets[i].xbytes,Packets[i].rbytes,i,
+					sizeof(typename getPrecision<cobj>::real_scalar_type));
     }
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0,
+					 sizeof(typename getPrecision<cobj>::real_scalar_type));
     if   ( this->partialDirichlet ) DslashLogPartial();
     else if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index d013763a44..bd2ce36600 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -454,7 +454,8 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;
+    std::cout<<GridLogMessage<<"  --comms-half-prec  : Compress halo exchange data to half-precision "<<std::endl;    
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
@@ -489,6 +490,10 @@ void Grid_init(int *argc,char ***argv)
     WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
     StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
   }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-bfloat16") ){
+    CartesianCommunicator::SetStencilCompressionPolicy(CartesianCommunicator::StencilCompressionPolicyBfloat16);
+    std::cout << GridLogMessage << "Compress floats to bfloat16 in stencil halo exchange" << std::endl;
+  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
   }
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 00526893cd..188e5fe265 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -220,7 +220,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,1,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,1,
-					      bytes,bytes,mu);
+					      bytes,bytes,mu,sizeof(double));
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
@@ -231,11 +231,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,1,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,1,
-					      bytes,bytes,mu+4);
+					      bytes,bytes,mu+4,sizeof(double));
 	  
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests,0);
+	Grid.StencilSendToRecvFromComplete(requests,0,sizeof(double));
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
@@ -312,8 +312,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,1,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,1,
-					      bytes,bytes,mu);
-	    Grid.StencilSendToRecvFromComplete(requests,mu);
+					      bytes,bytes,mu,sizeof(double));
+	    Grid.StencilSendToRecvFromComplete(requests,mu,sizeof(double));
 	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
@@ -325,8 +325,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,1,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,1,
-					      bytes,bytes,mu+4);
-	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
+					      bytes,bytes,mu+4,sizeof(double));
+	    Grid.StencilSendToRecvFromComplete(requests,mu+4,sizeof(double));
 	    requests.resize(0);
 	  
 	  }
@@ -412,7 +412,7 @@ int main (int argc, char ** argv)
 	    }
             int tid = omp_get_thread_num();
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
-					       (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
+					       (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid,sizeof(double));
 
 	    thread_critical { dbytes+=tbytes; }
 	  }

From 0a316ad3f5d1274b3ebb7ba355a4d4573d9bdd2d Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 25 Sep 2024 12:49:53 +0200
Subject: [PATCH 03/53] adjust help message

---
 Grid/util/Init.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index bd2ce36600..e420524791 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -455,7 +455,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
     std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-half-prec  : Compress halo exchange data to half-precision "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-bfloat16   : Compress halo exchange data from float to bfloat16 "<<std::endl;    
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    

From a4b99232f06179ee564004e10a824a01de8945aa Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 27 Sep 2024 20:07:21 +0300
Subject: [PATCH 04/53] mute benchmarking logs

mute benchmarking logs (2)
---
 Grid/communicator/Communicator_mpi3.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index cbe5c5bb20..1859d976a3 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -399,7 +399,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 	});
       double t1 = usecond() / 1e6;
       double GBps = (2.*xbytes/1e9)/(t1-t0);
-      std::cout << GridLogMessage << "Compress at " << GBps << " GB/s for nwords = " << nwords << std::endl;
+      //std::cout << GridLogMessage << "Compress at " << GBps << " GB/s for nwords = " << nwords << std::endl;
 
       // bfloat[0]  bfloat[nwords_half]  bfloat[1]  bfloat[nwords_half + 1]  ...
     }
@@ -485,7 +485,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
     
     double t1 = usecond() / 1e6;
     double GBps = Gb/(t1-t0);
-    std::cout << GridLogMessage << "Decompress at " << GBps << " GB/s for GB =" << Gb << std::endl;
+    //std::cout << GridLogMessage << "Decompress at " << GBps << " GB/s for GB =" << Gb << std::endl;
 
   }
 

From 7b7c1bf860fa1147d16b715410242231a776b0be Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 9 Oct 2024 19:42:45 +0200
Subject: [PATCH 05/53] Don't benchmark Dirichlet version for JUREAP

---
 Grid/qcd/smearing/GaugeConfiguration.h       |   1 +
 Grid/qcd/smearing/GaugeConfigurationMasked.h |   1 +
 benchmarks/Benchmark_dwf_fp32_jureap.cc      | 416 +++++++++++++++++++
 3 files changed, 418 insertions(+)
 create mode 100644 benchmarks/Benchmark_dwf_fp32_jureap.cc

diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h
index b25bd2f9b9..f85c37db59 100644
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -120,6 +120,7 @@ class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
   }
 
   /*! @brief Returns smeared configuration at level 'Level' */
+public:
   const GaugeField &get_smeared_conf(int Level) const
   {
     return SmearedSet[Level];
diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h
index 78846263cc..90046448a2 100644
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -820,6 +820,7 @@ class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
   }
 
 private:
+public:
   //====================================================================
   // Override base clas here to mask it
   virtual void fill_smearedSet(GaugeField &U)
diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc
new file mode 100644
index 0000000000..57b684e991
--- /dev/null
+++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc
@@ -0,0 +1,416 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+////////////////////////
+/// Move to domains ////
+////////////////////////
+
+Gamma::Algebra Gmu [] = {
+			 Gamma::Algebra::GammaX,
+			 Gamma::Algebra::GammaY,
+			 Gamma::Algebra::GammaZ,
+			 Gamma::Algebra::GammaT
+};
+
+void Benchmark(int Ls, Coordinate Dirichlet);
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  int threads = GridThread::GetThreads();
+
+  int Ls=16;
+  for(int i=0;i<argc;i++) {
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+  }
+
+  //////////////////
+  // With comms
+  //////////////////
+  Coordinate Dirichlet(Nd+1,0);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  
+  Benchmark(Ls,Dirichlet);
+
+  //////////////////
+  // Domain decomposed
+  //////////////////
+  Coordinate latt4  = GridDefaultLatt();
+  Coordinate mpi    = GridDefaultMpi();
+  Coordinate CommDim(Nd);
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+
+
+
+  Grid_finalize();
+  exit(0);
+}
+void Benchmark(int Ls, Coordinate Dirichlet)
+{
+  Coordinate latt4 = GridDefaultLatt();
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+#define SINGLE
+#ifdef SINGLE
+  typedef vComplexF          Simd;
+  typedef LatticeFermionF    FermionField;
+  typedef LatticeGaugeFieldF GaugeField;
+  typedef LatticeColourMatrixF ColourMatrixField;
+  typedef DomainWallFermionF FermionAction;
+#endif
+#ifdef DOUBLE
+  typedef vComplexD          Simd;
+  typedef LatticeFermionD    FermionField;
+  typedef LatticeGaugeFieldD GaugeField;
+  typedef LatticeColourMatrixD ColourMatrixField;
+  typedef DomainWallFermionD FermionAction;
+#endif
+#ifdef DOUBLE2
+  typedef vComplexD2          Simd;
+  typedef LatticeFermionD2    FermionField;
+  typedef LatticeGaugeFieldD2 GaugeField;
+  typedef LatticeColourMatrixD2 ColourMatrixField;
+  typedef DomainWallFermionD2 FermionAction;
+#endif
+  
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+
+ 
+  FermionField src   (FGrid); random(RNG5,src);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+#endif
+
+  FermionField result(FGrid); result=Zero();
+  FermionField    ref(FGrid);    ref=Zero();
+  FermionField    tmp(FGrid);
+  FermionField    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  GaugeField Umu(UGrid);
+  GaugeField UmuCopy(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(Umu);
+  UmuCopy=Umu;
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+  ////////////////////////////////////
+  // Apply BCs
+  ////////////////////////////////////
+  Coordinate Block(4);
+  for(int d=0;d<4;d++)  Block[d]= Dirichlet[d+1];
+
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
+
+  DirichletFilter<GaugeField> Filter(Block);
+  Filter.applyFilter(Umu);
+  
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<ColourMatrixField> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  }
+
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+	  }
+	}
+      }
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  FermionAction::ImplParams p;
+  p.dirichlet=Dirichlet;
+  FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.ImportGauge(Umu);
+  
+  int ncall =300;
+  RealD n2e;
+  
+  if (1) {
+    FGrid->Barrier();
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=single_site_flops*volume*ncall;
+
+    auto nsimd = Simd::Nsimd();
+    auto simdwidth = sizeof(Simd);
+
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    err = ref-result;
+    n2e = norm2(err);
+    std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+
+    if(( n2e>1.0e-4) ) {
+      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      std::cout<<GridLogMessage << "RESULT" << std::endl;
+      //      std::cout << result<<std::endl;
+      std::cout << norm2(result)<<std::endl;
+      std::cout<<GridLogMessage << "REF" << std::endl;
+      std::cout << norm2(ref)<<std::endl;
+      std::cout<<GridLogMessage << "ERR" << std::endl;
+      std::cout << norm2(err)<<std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert (n2e< 1.0e-4 );
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    int i=s+Ls*ss;
+	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+	  }
+	}
+      }
+      
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      //      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+    ref = -0.5*ref;
+  }
+
+  Dw.Dhop(src,result,DaggerYes);
+
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
+
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  err = ref-result;
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
+
+  assert((n2e)<1.0e-4);
+  
+  FermionField src_e (FrbGrid);
+  FermionField src_o (FrbGrid);
+  FermionField r_e   (FrbGrid);
+  FermionField r_o   (FrbGrid);
+  FermionField r_eo  (FGrid);
+
+  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    FGrid->Barrier();
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(single_site_flops*volume*ncall)/2.0;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
+  }
+  Dw.DhopEO(src_o,r_e,DaggerNo);
+  Dw.DhopOE(src_e,r_o,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
+
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err = r_eo-result;
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
+  assert(n2e<1.0e-4);
+
+  pickCheckerboard(Even,src_e,err);
+  pickCheckerboard(Odd,src_o,err);
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
+}

From 6cd4ce201200274116bdae333e24c70769655203 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 18 Dec 2024 14:33:27 +0100
Subject: [PATCH 06/53] Update Benchmark_dwf_fp32_jureap.cc

---
 benchmarks/Benchmark_dwf_fp32_jureap.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc
index 57b684e991..a768b00a07 100644
--- a/benchmarks/Benchmark_dwf_fp32_jureap.cc
+++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc
@@ -45,6 +45,8 @@ Gamma::Algebra Gmu [] = {
 
 void Benchmark(int Ls, Coordinate Dirichlet);
 
+#include <chrono>
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -247,13 +249,25 @@ void Benchmark(int Ls, Coordinate Dirichlet)
     FGrid->Barrier();
     Dw.Dhop(src,result,0);
     std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    
+    FGrid->Barrier();
+    std::this_thread::sleep_for(std::chrono::seconds(5));
+    auto cpu_start = std::chrono::system_clock::now();
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       Dw.Dhop(src,result,0);
     }
     double t1=usecond();
     FGrid->Barrier();
+    auto cpu_stop = std::chrono::system_clock::now();
+    std::this_thread::sleep_for(std::chrono::seconds(5));
 
+    if (FGrid->ThisRank() == 0) {
+      std::ofstream  file("energy.times");
+      file<<"energy_start:"<<  std::chrono::duration_cast<std::chrono::milliseconds>(cpu_start.time_since_epoch()).count()<<std::endl;
+      file<<"energy_stop:"<<  std::chrono::duration_cast<std::chrono::milliseconds>(cpu_stop.time_since_epoch()).count()<<std::endl;
+    }
+    
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=single_site_flops*volume*ncall;
 

From 64c227cdbc12ea2206425f46b1fc02c2f080a66e Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 18 Dec 2024 15:38:37 +0100
Subject: [PATCH 07/53] Update Benchmark_dwf_fp32_jureap.cc

---
 benchmarks/Benchmark_dwf_fp32_jureap.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc
index a768b00a07..92dd40bc1b 100644
--- a/benchmarks/Benchmark_dwf_fp32_jureap.cc
+++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc
@@ -46,6 +46,7 @@ Gamma::Algebra Gmu [] = {
 void Benchmark(int Ls, Coordinate Dirichlet);
 
 #include <chrono>
+#include <thread>
 
 int main (int argc, char ** argv)
 {

From eb1058da1d1b45c1dcc63acb6af4baa7ec661cb6 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 26 Jan 2025 16:56:07 +0100
Subject: [PATCH 08/53] Merge with upstream

---
 Grid/util/Lexicographic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index 422e42ee76..8636333d77 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
       index = (int) index64;

From e138258129faed5cb12fc0e5bbf829f69c6a2a3d Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 26 Jan 2025 17:14:29 +0100
Subject: [PATCH 09/53] complete merge conflict resolution

---
 Grid/communicator/Communicator_mpi3.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index b683b7cb40..058a7cdf90 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -325,12 +325,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   tag= dir+from*32;
   int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
   assert(ierr==0);
-  list.push_back(rrq);
+  list.push_back({rrq, recv, bytes});
   
   tag= dir+_processor*32;
   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
   assert(ierr==0);
-  list.push_back(xrq);
+  list.push_back({xrq, xmit, bytes});
 }
 void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
 {
@@ -339,7 +339,11 @@ void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
   if (nreq==0) return;
 
   std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  std::vector<MPI_Request> requests(nreq);
+  for (int i=0;i<nreq;i++)
+    requests[i] = list[i].request;
+
+  int ierr = MPI_Waitall(nreq,&requests[0],&status[0]);
   assert(ierr==0);
   list.resize(0);
 }

From 5adb165cd95a5627c9cc621dcf2cc06ad898310b Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 22 Feb 2025 16:26:12 +0100
Subject: [PATCH 10/53] allow for transpose in gemm of complex matrices

---
 Grid/algorithms/blas/BatchedBlas.h | 50 +++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index f4092bc535..ce615173ec 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -208,8 +208,8 @@ class GridBLAS {
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -376,6 +376,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
@@ -383,6 +390,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
@@ -390,6 +404,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
       } else { 
 	assert(0);
       }
@@ -414,8 +435,8 @@ class GridBLAS {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -523,6 +544,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
@@ -530,6 +558,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
@@ -537,6 +572,13 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
       } else { 
 	assert(0);
       }

From 059b6e67f0be0a7fb1595bb2e6df4d67747564e0 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 2 Mar 2025 14:23:01 +0100
Subject: [PATCH 11/53] default gemm needs to ignore C if beta vanishes

---
 Grid/algorithms/blas/BatchedBlas.h | 107 +++++++++++++++++++++++------
 1 file changed, 85 insertions(+), 22 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index ce615173ec..88565df1a5 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -367,28 +367,40 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
         });
       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
@@ -402,14 +414,20 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
       } else { 
 	assert(0);
@@ -535,49 +553,70 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
       } else { 
 	assert(0);
@@ -703,29 +742,41 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  } );
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
       } else { 
 	assert(0);
       }
@@ -851,28 +902,40 @@ class GridBLAS {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
       } else { 
 	assert(0);

From 2acb6578ecf8d7f30478733a1c711039e6745b22 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 10 Mar 2025 06:58:47 +0100
Subject: [PATCH 12/53] cpu target async

---
 Grid/threads/Accelerator.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 1cb56dddad..0128856cc5 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -568,6 +568,8 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyToDeviceAsync(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDeviceAsync(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 

From e85b7986cf144f2b852da2e4635b8dd726b1060d Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 20 Mar 2025 11:38:24 +0100
Subject: [PATCH 13/53] library adjustment for JEDI

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 8e8d67af77..1483bcae35 100644
--- a/configure.ac
+++ b/configure.ac
@@ -136,7 +136,7 @@ AC_ARG_ENABLE([tracing],
 case ${ac_TRACING} in
     nvtx)
         AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
-	LIBS="${LIBS} -lnvToolsExt64_1"
+	LIBS="${LIBS} -lnvToolsExt"
 	;;
     roctx)
         AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])

From 98cce16f0686da4b8d7a5f971bb6bed31c5b9c9c Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 21 Mar 2025 12:58:38 +0100
Subject: [PATCH 14/53] Towards inverse and determinant in batched blas on all
 platforms

---
 Grid/algorithms/blas/BatchedBlas.h | 186 +++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 88565df1a5..9de035d069 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -946,6 +946,192 @@ class GridBLAS {
      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
   }
 
+  /*
+    Inverse and Determinant
+
+    - CPU version uses Eigen
+    - GPU version uses LAPACK-compatible getrf / getri
+
+    Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
+                    Overhead to go through getrf / getri for CPU version too large.
+		    Current interface therefore only guarantees the inverse and determinant
+		    functions on all platforms but not the getrf / getri ones.
+  */
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexD*> &Ann,
+		      deviceVector<ComplexD*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+	Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+	Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
+	eCnn = eAnn.inverse();
+      });
+  }
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexF*> &Ann,
+		      deviceVector<ComplexF*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+	Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+	Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
+	eCnn = eAnn.inverse();
+      });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexD*> &Ann,
+			  deviceVector<ComplexD*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+	Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+	*C[p] = eAnn.determinant();
+      });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexF*> &Ann,
+			  deviceVector<ComplexF*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+	Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+	*C[p] = eAnn.determinant();
+      });
+  }
+  
+#else
+
+  void getrfBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cudablasZgetrfBatched(gridblasHandle, (int)n,
+				     (cublasDoubleComplex **)&Ann[0], (int)n,
+				     (int*) &ipiv[0],
+				     (int*) &info[0],
+				     (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    assert(0);
+    /*
+      TODO: cache scratchpad
+
+    oneapi::mkl::lapack::gerf_batch(*gridblasHandle,
+				    &n, &n,
+				    (ComplexD **)&Ann[0],
+				    &n,
+				    (int64_t)&ipiv[0],
+				    (int64_t)1, &batchCount,
+				    scratchpad, scratchpad_size,
+				    std::vector<sycl::event>());
+    */
+    synchronise();
+#endif
+  }
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexD*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cudablasZgetrfBatched(gridblasHandle, (int)n,
+				     (cublasDoubleComplex **)&Ann[0], (int)n,
+				     (int*) &ipiv[0],
+				     (cublasDoubleComplex **)&Cnn[0], (int)n,
+				     (int*) &info[0],
+				     (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    assert(0);
+    /*
+      TODO: cache scratchpad
+    oneapi::mkl::lapack::geri_batch(*gridblasHandle,
+				    &n, &n,
+				    (ComplexD **)&Ann[0],
+				    &n,
+				    (int64_t)&ipiv[0],
+				    (int64_t)1, &batchCount,
+				    scratchpad, scratchpad_size,
+				    std::vector<sycl::event>());
+    */
+    synchronise();
+#endif
+  }
+
+  template<typename dtype>
+  void inverseBatched(int64_t n,
+		      deviceVector<dtype*> &Ann,
+		      deviceVector<dtype*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+    deviceVector<dtype> _Bnn(batchCount*n*n);
+    deviceVector<dtype*> Bnn(batchCount);
+
+    accelerator_for(i,batchCount,1,{
+	Bnn[i] = &_Bnn[n*n*i];
+      });
+    accelerator_for(i,batchCount*n*n,1,{
+	int64_t a = i / (n*n);
+	int64_t b = i % (n*n);
+	_Bnn[i] = Ann[i][b];
+      });
+
+    RealD t1 = usecond();
+    getrfBatched(n, Bnn, ipiv, info);
+    // test info for non-invertibility?  set to nan if yes?
+    RealD t2 = usecond();
+    getriBatched(n, Bnn, ipiv, info, Cnn);
+    RealD t3 = usecond();
+    std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl;
+  }
+#endif
+  
+  
   template<class CComplex>
   double benchmark(int M, int N, int K, int BATCH)
   {

From 1d7edc32c2255e83ff66b436433fdada9bd388a7 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 21 Mar 2025 15:35:59 +0100
Subject: [PATCH 15/53] Continue with cublas

---
 Grid/algorithms/blas/BatchedBlas.h | 156 ++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 12 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 9de035d069..52137b48d0 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -1029,13 +1029,56 @@ class GridBLAS {
     assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    auto err = cudablasZgetrfBatched(gridblasHandle, (int)n,
-				     (cublasDoubleComplex **)&Ann[0], (int)n,
-				     (int*) &ipiv[0],
-				     (int*) &info[0],
-				     (int)batchCount);
+    auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    assert(0);
+    /*
+      TODO: cache scratchpad
+
+    oneapi::mkl::lapack::gerf_batch(*gridblasHandle,
+				    &n, &n,
+				    (ComplexD **)&Ann[0],
+				    &n,
+				    (int64_t)&ipiv[0],
+				    (int64_t)1, &batchCount,
+				    scratchpad, scratchpad_size,
+				    std::vector<sycl::event>());
+    */
+    synchronise();
+#endif
+  }
+
+void getrfBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
     assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
 #ifdef GRID_SYCL
     assert(0);
     /*
@@ -1066,7 +1109,7 @@ class GridBLAS {
     assert(Cnn.size()==batchCount);
 
 #ifdef GRID_HIP
-    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
 				    (hipblasDoubleComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
 				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
@@ -1075,14 +1118,60 @@ class GridBLAS {
     assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    auto err = cudablasZgetrfBatched(gridblasHandle, (int)n,
-				     (cublasDoubleComplex **)&Ann[0], (int)n,
-				     (int*) &ipiv[0],
-				     (cublasDoubleComplex **)&Cnn[0], (int)n,
-				     (int*) &info[0],
-				     (int)batchCount);
+    auto err = cublasZgetriBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuDoubleComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    assert(0);
+    /*
+      TODO: cache scratchpad
+    oneapi::mkl::lapack::geri_batch(*gridblasHandle,
+				    &n, &n,
+				    (ComplexD **)&Ann[0],
+				    &n,
+				    (int64_t)&ipiv[0],
+				    (int64_t)1, &batchCount,
+				    scratchpad, scratchpad_size,
+				    std::vector<sycl::event>());
+    */
+    synchronise();
+#endif
+  }
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexF*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
     assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetriBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
 #ifdef GRID_SYCL
     assert(0);
     /*
@@ -1129,6 +1218,49 @@ class GridBLAS {
     RealD t3 = usecond();
     std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl;
   }
+
+  template<typename dtype>
+  void determinantBatched(int64_t n,
+			  deviceVector<dtype*> &Ann,
+			  deviceVector<dtype*> &C) {
+
+    int64_t batchCount = Ann.size();
+    RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+    deviceVector<dtype> _Bnn(batchCount*n*n);
+    deviceVector<dtype*> Bnn(batchCount);
+
+    dtype** pAnn = (dtype**)&Ann[0];
+    dtype** pBnn = (dtype**)&Bnn[0];
+    dtype* p_Bnn = (dtype*)&_Bnn[0];
+    dtype* pC = (dtype*)&C[0];
+    int64_t* pipiv = (int64_t*)&ipiv[0];
+    accelerator_for(i,batchCount,1,{
+	pBnn[i] = &p_Bnn[n*n*i];
+      });
+    accelerator_for(i,batchCount*n*n,1,{
+	int64_t a = i / (n*n);
+	int64_t b = i % (n*n);
+	p_Bnn[i] = pAnn[i][b];
+      });
+
+    RealD t1 = usecond();
+    getrfBatched(n, Bnn, ipiv, info);
+    RealD t2 = usecond();
+    accelerator_for(i,batchCount,1,{
+	dtype det = 1.0;
+	for (int64_t j=0;j<n;j++) {
+	  det *= pBnn[i][n*j + j];
+	  // branchless signs
+	  det *= (pipiv[i*n + j] == j) ? (1.0) : (-1.0);
+	}
+	pC[i] = det;
+      });
+    
+    RealD t3 = usecond();
+    std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl;
+  }
 #endif
   
   

From cc67283c4f9f0c4a0dccfadb03cb017ce75af997 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 21 Mar 2025 20:02:08 +0100
Subject: [PATCH 16/53] Complete first version on CUDA/HIP/CPU of
 GridBlas::inverseBatched and determinantBatched

---
 Grid/algorithms/blas/BatchedBlas.h | 79 ++++++++++++------------------
 1 file changed, 31 insertions(+), 48 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 52137b48d0..aef3246dcb 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -1017,7 +1017,7 @@ class GridBLAS {
 		    deviceVector<int64_t> &info)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount);
+    assert(ipiv.size()==batchCount*n);
     assert(info.size()==batchCount);
 
 #ifdef GRID_HIP
@@ -1060,7 +1060,7 @@ void getrfBatched(int64_t n,
 		    deviceVector<int64_t> &info)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount);
+    assert(ipiv.size()==batchCount*n);
     assert(info.size()==batchCount);
 
 #ifdef GRID_HIP
@@ -1104,7 +1104,7 @@ void getrfBatched(int64_t n,
 		    deviceVector<ComplexD*> &Cnn)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount);
+    assert(ipiv.size()==batchCount*n);
     assert(info.size()==batchCount);
     assert(Cnn.size()==batchCount);
 
@@ -1150,7 +1150,7 @@ void getrfBatched(int64_t n,
 		    deviceVector<ComplexF*> &Cnn)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount);
+    assert(ipiv.size()==batchCount*n);
     assert(info.size()==batchCount);
     assert(Cnn.size()==batchCount);
 
@@ -1191,75 +1191,58 @@ void getrfBatched(int64_t n,
 
   template<typename dtype>
   void inverseBatched(int64_t n,
-		      deviceVector<dtype*> &Ann,
-		      deviceVector<dtype*> &Cnn) {
+		      deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+		      deviceVector<dtype*> &Cnn  // this will be overwritten with the inverse
+		      ) {
 
     int64_t batchCount = Ann.size();
     RealD t0 = usecond();
     deviceVector<int64_t> ipiv(batchCount*n);
     deviceVector<int64_t> info(batchCount);
-    deviceVector<dtype> _Bnn(batchCount*n*n);
-    deviceVector<dtype*> Bnn(batchCount);
 
-    accelerator_for(i,batchCount,1,{
-	Bnn[i] = &_Bnn[n*n*i];
-      });
-    accelerator_for(i,batchCount*n*n,1,{
-	int64_t a = i / (n*n);
-	int64_t b = i % (n*n);
-	_Bnn[i] = Ann[i][b];
-      });
-
-    RealD t1 = usecond();
-    getrfBatched(n, Bnn, ipiv, info);
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
     // test info for non-invertibility?  set to nan if yes?
-    RealD t2 = usecond();
-    getriBatched(n, Bnn, ipiv, info, Cnn);
-    RealD t3 = usecond();
-    std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl;
+    getriBatched(n, Ann, ipiv, info, Cnn);
+    //synchronise();
+    //RealD t2 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1  << std::endl;
   }
 
   template<typename dtype>
   void determinantBatched(int64_t n,
-			  deviceVector<dtype*> &Ann,
-			  deviceVector<dtype*> &C) {
+			  deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+			  deviceVector<dtype*> &C    // this will be overwritten with determinant
+			  ) {
 
     int64_t batchCount = Ann.size();
-    RealD t0 = usecond();
+    //RealD t0 = usecond();
     deviceVector<int64_t> ipiv(batchCount*n);
     deviceVector<int64_t> info(batchCount);
-    deviceVector<dtype> _Bnn(batchCount*n*n);
-    deviceVector<dtype*> Bnn(batchCount);
-
+    
     dtype** pAnn = (dtype**)&Ann[0];
-    dtype** pBnn = (dtype**)&Bnn[0];
-    dtype* p_Bnn = (dtype*)&_Bnn[0];
-    dtype* pC = (dtype*)&C[0];
+    dtype** pC = (dtype**)&C[0];
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    int* pipiv = (int*)&ipiv[0];
+#else
     int64_t* pipiv = (int64_t*)&ipiv[0];
-    accelerator_for(i,batchCount,1,{
-	pBnn[i] = &p_Bnn[n*n*i];
-      });
-    accelerator_for(i,batchCount*n*n,1,{
-	int64_t a = i / (n*n);
-	int64_t b = i % (n*n);
-	p_Bnn[i] = pAnn[i][b];
-      });
+#endif
 
-    RealD t1 = usecond();
-    getrfBatched(n, Bnn, ipiv, info);
-    RealD t2 = usecond();
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
+    //RealD t2 = usecond();
     accelerator_for(i,batchCount,1,{
 	dtype det = 1.0;
 	for (int64_t j=0;j<n;j++) {
-	  det *= pBnn[i][n*j + j];
+	  det *= pAnn[i][n*j + j];
 	  // branchless signs
-	  det *= (pipiv[i*n + j] == j) ? (1.0) : (-1.0);
+	  det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
 	}
-	pC[i] = det;
+	*pC[i] = det;
       });
     
-    RealD t3 = usecond();
-    std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl;
+    //RealD t3 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1  << "final" << t3 - t2 << std::endl;
   }
 #endif
   

From 23db75d945bec722e74020f719698e2f113f0127 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 10 Apr 2025 20:12:12 +0200
Subject: [PATCH 17/53] CompactWilsonClover5D

---
 .../fermion/CompactWilsonCloverFermion5D.h    | 196 +++++++++
 Grid/qcd/action/fermion/Fermion.h             |   6 +
 ...mpactWilsonCloverFermion5DImplementation.h | 376 ++++++++++++++++++
 ...lsonCloverFermion5DInstantiation.cc.master |  45 +++
 ...CloverFermion5DInstantiationWilsonImplD.cc |   1 +
 ...CloverFermion5DInstantiationWilsonImplF.cc |   1 +
 .../instantiation/generate_instantiations.sh  |   2 +-
 7 files changed, 626 insertions(+), 1 deletion(-)
 create mode 100644 Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
 create mode 100644 Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
 create mode 100644 Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master
 create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc
 create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc

diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
new file mode 100644
index 0000000000..2c6aa587d2
--- /dev/null
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
@@ -0,0 +1,196 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
+
+    Copyright (C) 2020 - 2025
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
+#include <Grid/qcd/action/fermion/CloverHelpers.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
+
+template<class Impl, class CloverHelpers>
+class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
+				     public WilsonCloverHelpers<Impl>,
+				     public CompactWilsonCloverHelpers<Impl> {
+  /////////////////////////////////////////////
+  // Sizes
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  /////////////////////////////////////////////
+  // Type definitions
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  typedef WilsonFermion5D<Impl>            WilsonBase;
+  typedef WilsonCloverHelpers<Impl>        Helpers;
+  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
+
+  /////////////////////////////////////////////
+  // Constructors
+  /////////////////////////////////////////////
+
+public:
+
+  CompactWilsonCloverFermion5D(GaugeField& _Umu,
+			       GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       const RealD _mass,
+			       const RealD _csw_r = 0.0,
+			       const RealD _csw_t = 0.0,
+			       const RealD _cF = 1.0,
+			       const ImplParams& impl_p = ImplParams());
+
+  /////////////////////////////////////////////
+  // Member functions (implementing interface)
+  /////////////////////////////////////////////
+
+public:
+
+  virtual void Instantiatable() {};
+  int          ConstEE()     override { return 0; };
+  int          isTrivialEE() override { return 0; };
+
+  void Dhop(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
+
+  void M(const FermionField& in, FermionField& out) override;
+
+  void Mdag(const FermionField& in, FermionField& out) override;
+
+  void Meooe(const FermionField& in, FermionField& out) override;
+
+  void MeooeDag(const FermionField& in, FermionField& out) override;
+
+  void Mooee(const FermionField& in, FermionField& out) override;
+
+  void MooeeDag(const FermionField& in, FermionField& out) override;
+
+  void MooeeInv(const FermionField& in, FermionField& out) override;
+
+  void MooeeInvDag(const FermionField& in, FermionField& out) override;
+
+  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
+
+  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
+
+  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  /////////////////////////////////////////////
+  // Member functions (internals)
+  /////////////////////////////////////////////
+
+  void MooeeInternal(const FermionField&        in,
+                     FermionField&              out,
+                     const CloverDiagonalField& diagonal,
+                     const CloverTriangleField& triangle);
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+  void ImportGauge(const GaugeField& _Umu) override;
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+private:
+
+  template<class Field>
+  const MaskField* getCorrectMaskField(const Field &in) const {
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        return &this->BoundaryMaskOdd;
+      } else {
+        return &this->BoundaryMaskEven;
+      }
+    } else {
+      return &this->BoundaryMask;
+    }
+  }
+
+  template<class Field>
+  void ApplyBoundaryMask(Field& f) {
+    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
+    assert(m != nullptr);
+    CompactHelpers::ApplyBoundaryMask(f, *m);
+  }
+
+  /////////////////////////////////////////////
+  // Member Data
+  /////////////////////////////////////////////
+
+public:
+
+  RealD csw_r;
+  RealD csw_t;
+  RealD cF;
+  int n_rhs;
+  
+  bool fixedBoundaries;
+
+  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
+  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
+
+  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
+  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
+
+  FermionField Tmp;
+
+  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h
index 8d7b3fdc30..a3d96d9b03 100644
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
 
 // Compact Clover fermions
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
+template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 
 typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
 
+typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
+typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
+typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
+
 typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
new file mode 100644
index 0000000000..e65fb6d675
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
@@ -0,0 +1,376 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
+
+    Copyright (C) 2017 - 2025
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
+
+
+NAMESPACE_BEGIN(Grid);
+template<class Impl, class CloverHelpers>
+CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
+										GridCartesian         &FiveDimGrid,
+										GridRedBlackCartesian &FiveDimRedBlackGrid,
+										GridCartesian         &FourDimGrid,
+										GridRedBlackCartesian &FourDimRedBlackGrid,
+										const RealD _mass,
+										const RealD _csw_r,
+										const RealD _csw_t,
+										const RealD _cF,
+										const ImplParams& impl_p)
+  : WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
+  , csw_r(_csw_r)
+  , csw_t(_csw_t)
+  , cF(_cF)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , Diagonal(&FourDimGrid),        Triangle(&FourDimGrid)
+  , DiagonalEven(&FourDimRedBlackGrid),    TriangleEven(&FourDimRedBlackGrid)
+  , DiagonalOdd(&FourDimRedBlackGrid),     TriangleOdd(&FourDimRedBlackGrid)
+  , DiagonalInv(&FourDimGrid),     TriangleInv(&FourDimGrid)
+  , DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
+  , DiagonalInvOdd(&FourDimRedBlackGrid),  TriangleInvOdd(&FourDimRedBlackGrid)
+  , Tmp(&FiveDimGrid)
+  , BoundaryMask(&FiveDimGrid)
+  , BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
+{
+  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
+
+  csw_r *= 0.5;
+  csw_t *= 0.5;
+  //if (clover_anisotropy.isAnisotropic)
+  //  csw_r /= clover_anisotropy.xi_0;
+
+  ImportGauge(_Umu);
+  if (fixedBoundaries) {
+    this->BoundaryMaskEven.Checkerboard() = Even;
+    this->BoundaryMaskOdd.Checkerboard() = Odd;
+    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
+  }
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::Dhop(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopOE(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopEO(in, out, dag);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+  WilsonBase::DhopDir(in, out, dir, disp);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+  WilsonBase::DhopDirAll(in, out);
+  if(this->fixedBoundaries) {
+    for(auto& o : out) ApplyBoundaryMask(o);
+  }
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
+  Mooee(in, Tmp);
+  axpy(out, 1.0, out, Tmp);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
+  MooeeDag(in, Tmp);
+  axpy(out, 1.0, out, Tmp);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
+  WilsonBase::Meooe(in, out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
+  WilsonBase::MeooeDag(in, out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
+  if(in.Grid()->_isCheckerBoarded) {
+    if(in.Checkerboard() == Odd) {
+      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
+    } else {
+      MooeeInternal(in, out, DiagonalEven, TriangleEven);
+    }
+  } else {
+    MooeeInternal(in, out, Diagonal, Triangle);
+  }
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
+  Mooee(in, out); // blocks are hermitian
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
+  if(in.Grid()->_isCheckerBoarded) {
+    if(in.Checkerboard() == Odd) {
+      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
+    } else {
+      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
+    }
+  } else {
+    MooeeInternal(in, out, DiagonalInv, TriangleInv);
+  }
+  if(fixedBoundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
+  MooeeInv(in, out); // blocks are hermitian
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+  DhopDir(in, out, dir, disp);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+  DhopDirAll(in, out);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+  assert(!fixedBoundaries); // TODO check for changes required for open bc
+
+  // NOTE: code copied from original clover term
+  conformable(X.Grid(), Y.Grid());
+  conformable(X.Grid(), force.Grid());
+  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+  GaugeField clover_force(force.Grid());
+  PropagatorField Lambda(force.Grid());
+
+  // Guido: Here we are hitting some performance issues:
+  // need to extract the components of the DoubledGaugeField
+  // for each call
+  // Possible solution
+  // Create a vector object to store them? (cons: wasting space)
+  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+  Impl::extractLinkField(U, this->Umu);
+
+  force = Zero();
+  // Derivative of the Wilson hopping term
+  this->DhopDeriv(force, X, Y, dag);
+
+  ///////////////////////////////////////////////////////////
+  // Clover term derivative
+  ///////////////////////////////////////////////////////////
+  Impl::outerProductImpl(Lambda, X, Y);
+  //std::cout << "Lambda:" << Lambda << std::endl;
+
+  Gamma::Algebra sigma[] = {
+      Gamma::Algebra::SigmaXY,
+      Gamma::Algebra::SigmaXZ,
+      Gamma::Algebra::SigmaXT,
+      Gamma::Algebra::MinusSigmaXY,
+      Gamma::Algebra::SigmaYZ,
+      Gamma::Algebra::SigmaYT,
+      Gamma::Algebra::MinusSigmaXZ,
+      Gamma::Algebra::MinusSigmaYZ,
+      Gamma::Algebra::SigmaZT,
+      Gamma::Algebra::MinusSigmaXT,
+      Gamma::Algebra::MinusSigmaYT,
+      Gamma::Algebra::MinusSigmaZT};
+
+  /*
+    sigma_{\mu \nu}=
+    | 0         sigma[0]  sigma[1]  sigma[2] |
+    | sigma[3]    0       sigma[4]  sigma[5] |
+    | sigma[6]  sigma[7]     0      sigma[8] |
+    | sigma[9]  sigma[10] sigma[11]   0      |
+  */
+
+  int count = 0;
+  clover_force = Zero();
+  for (int mu = 0; mu < 4; mu++)
+  {
+    force_mu = Zero();
+    for (int nu = 0; nu < 4; nu++)
+    {
+      if (mu == nu)
+        continue;
+
+      RealD factor;
+      if (nu == 4 || mu == 4)
+      {
+        factor = 2.0 * csw_t;
+      }
+      else
+      {
+        factor = 2.0 * csw_r;
+      }
+      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
+      count++;
+    }
+
+    pokeLorentz(clover_force, U[mu] * force_mu, mu);
+  }
+  //clover_force *= csw;
+  force += clover_force;
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+  assert(0);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+  assert(0);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
+								      FermionField&              out,
+								      const CloverDiagonalField& diagonal,
+								      const CloverTriangleField& triangle) {
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  out.Checkerboard() = in.Checkerboard();
+  conformable(in, out);
+  CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
+}
+
+template<class Impl, class CloverHelpers>
+void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
+  // NOTE: parts copied from original implementation
+
+  // Import gauge into base class
+  double t0 = usecond();
+  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
+
+  // Initialize temporary variables
+  double t1 = usecond();
+  conformable(_Umu.Grid(), this->GaugeGrid());
+  GridBase* grid = _Umu.Grid();
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+  CloverField TmpOriginal(grid);
+  CloverField TmpInverse(grid);
+
+  // Compute the field strength terms mu>nu
+  double t2 = usecond();
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin
+  // multiply here by the clover coefficients for the anisotropy
+  double t3 = usecond();
+  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
+  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
+  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
+  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
+  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
+  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
+
+  // Instantiate the clover term
+  // - In case of the standard clover the mass term is added
+  // - In case of the exponential clover the clover term is exponentiated
+  double t4 = usecond();
+  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
+
+  // Convert the data layout of the clover term
+  double t5 = usecond();
+  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
+
+  // Modify the clover term at the temporal boundaries in case of open boundary conditions
+  double t6 = usecond();
+  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
+
+  // Invert the Clover term
+  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
+  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
+  // TODO: For now this inversion is explictly done on the CPU
+  double t7 = usecond();
+  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
+
+  // Fill the remaining clover fields
+  double t8 = usecond();
+  pickCheckerboard(Even, DiagonalEven,    Diagonal);
+  pickCheckerboard(Even, TriangleEven,    Triangle);
+  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
+  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
+  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
+  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
+  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
+  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
+
+  // Report timings
+  double t9 = usecond();
+
+  std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
+}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master
new file mode 100644
index 0000000000..fd9c787bfa
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master
@@ -0,0 +1,45 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
+
+    Copyright (C) 2017 - 2025
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Mattia Bruno <mattia.bruno@cern.ch>
+    Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
+#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
+#include <Grid/qcd/action/fermion/CloverHelpers.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
+template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc
new file mode 120000
index 0000000000..ad6eba2865
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc
@@ -0,0 +1 @@
+../CompactWilsonCloverFermion5DInstantiation.cc.master
\ No newline at end of file
diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc
new file mode 120000
index 0000000000..ad6eba2865
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc
@@ -0,0 +1 @@
+../CompactWilsonCloverFermion5DInstantiation.cc.master
\ No newline at end of file
diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
index 728dc5e783..18194a23b8 100755
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -62,7 +62,7 @@ do
 done
 done
 
-CC_LIST="CompactWilsonCloverFermionInstantiation"
+CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
 
 for impl in $COMPACT_WILSON_IMPL_LIST
 do

From 147c2eba751138c92847e2eb48397ef5e03f392d Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 10 Apr 2025 22:14:55 +0200
Subject: [PATCH 18/53] Add checkerboarding to 5D compact clover

---
 Grid/qcd/action/fermion/WilsonFermion5D.h     | 12 ++---
 .../WilsonFermion5DImplementation.h           | 49 +++++++++++++++++++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index 0b07d320ab..44d624efc3 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -91,13 +91,13 @@ class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
   virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};
 
   // half checkerboard operations; leave unimplemented as abstract for now
-  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
 
-  virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
   virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
   virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 95af4c3897..dd22a5eb37 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Andrew Lawson <andrew.lawson1991@gmail.com>
 Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -450,6 +451,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
   Dhop(in,out,dag); // -0.5 is included
   axpy(out,4.0-M5,in,out);
 }
+template <class Impl>
+void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerNo);
+  } else {
+    DhopOE(in, out, DaggerNo);
+  }
+}
+
+template <class Impl>
+void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerYes);
+  } else {
+    DhopOE(in, out, DaggerYes);
+  }
+}
+
+template <class Impl>
+void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  typename FermionField::scalar_type scal(4.0 + M5);
+  out = scal * in;
+}
+
+template <class Impl>
+void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0/(4.0 + M5))*in;
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in,out);
+}
 
 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)

From 5e3a6f25b6adf6b12fbffa4ca116bc2bf1d74889 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 17 Apr 2025 18:38:10 +0200
Subject: [PATCH 19/53] Allow for reduced-precision GEMM

---
 Grid/algorithms/blas/BatchedBlas.h | 122 +++++++++++++++++++----------
 1 file changed, 80 insertions(+), 42 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index aef3246dcb..d25077345f 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid);
 #endif
 
 enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
 
 class GridBLAS {
 public:
@@ -97,7 +98,21 @@ class GridBLAS {
       gridblasInit=1;
     }
   }
-  
+
+#ifdef GRID_CUDA
+  cudaDataType toDataType(GridBLASPrecision_t p) {
+    switch (p) {
+    case GridBLAS_PRECISION_16F:
+      return CUBLAS_COMPUTE_32F_FAST_16F;
+    case GridBLAS_PRECISION_16BF:
+      return CUBLAS_COMPUTE_32F_FAST_16BF;
+    case GridBLAS_PRECISION_TF32:
+      return CUBLAS_COMPUTE_32F_FAST_TF32;
+    default:
+      assert(0);
+    }
+  }
+#endif
   // Force construct once
   GridBLAS() { Init(); };
   ~GridBLAS() { };
@@ -138,8 +153,10 @@ class GridBLAS {
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
@@ -201,8 +218,10 @@ class GridBLAS {
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
     assert(Bkn.size()==batchCount);
@@ -448,7 +467,8 @@ class GridBLAS {
 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexF*> &Bkn,
 		   ComplexF beta,
-		   deviceVector<ComplexF*> &Cmn)
+		   deviceVector<ComplexF*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
@@ -473,6 +493,7 @@ class GridBLAS {
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -503,50 +524,67 @@ class GridBLAS {
     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasCgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
-				  m,n,k,
-				  (cuComplex *) &alpha_p[0],
-				  (cuComplex **)&Amk[0], lda,
-				  (cuComplex **)&Bkn[0], ldb,
-				  (cuComplex *) &beta_p[0],
-				  (cuComplex **)&Cmn[0], ldc,
-				  batchCount);
+    cublasStatus_t err;
+    if (precision == GridBLAS_PRECISION_DEFAULT) {
+      err = cublasCgemmBatched(gridblasHandle,
+			       hOpA,
+			       hOpB,
+			       m,n,k,
+			       (cuComplex *) &alpha_p[0],
+			       (cuComplex **)&Amk[0], lda,
+			       (cuComplex **)&Bkn[0], ldb,
+			       (cuComplex *) &beta_p[0],
+			       (cuComplex **)&Cmn[0], ldc,
+			       batchCount);
+    } else {
+      cudaDataType compute_precision = toDataType(precision);
+      err = cublasGemmBatchedEx(gridblasHandle,
+				hOpA,
+				hOpB,
+				m,n,k,
+				(cuComplex *) &alpha_p[0],
+				(cuComplex **)&Amk[0], CUDA_C_32F, lda,
+				(cuComplex **)&Bkn[0], CUDA_C_32F, ldb,
+				(cuComplex *) &beta_p[0],
+				(cuComplex **)&Cmn[0], CUDA_C_32F, ldc,
+				batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
+    }
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-      int64_t m64=m;
-      int64_t n64=n;
-      int64_t k64=k;
-      int64_t lda64=lda;
-      int64_t ldb64=ldb;
-      int64_t ldc64=ldc;
-      int64_t batchCount64=batchCount;
-
-      oneapi::mkl::transpose iOpA;
-      oneapi::mkl::transpose iOpB;
-      
-      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
-      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
-      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
-      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
-      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
-      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
-
-      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						  &iOpA,
-						  &iOpB,
-						  &m64,&n64,&k64,
-						  (ComplexF *) &alpha_p[0],
-						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
-						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
-						  (ComplexF *) &beta_p[0],
-						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
-						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    int64_t m64=m;
+    int64_t n64=n;
+    int64_t k64=k;
+    int64_t lda64=lda;
+    int64_t ldb64=ldb;
+    int64_t ldc64=ldc;
+    int64_t batchCount64=batchCount;
+    
+    oneapi::mkl::transpose iOpA;
+    oneapi::mkl::transpose iOpB;
+    
+    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+    
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						&iOpA,
+						&iOpB,
+						&m64,&n64,&k64,
+						(ComplexF *) &alpha_p[0],
+						(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						(ComplexF *) &beta_p[0],
+						(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						(int64_t)1,&batchCount64,std::vector<sycl::event>());
     synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     // Need a default/reference implementation; use Eigen
       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {

From f0573d04c76dd67a0af2ef1ce18ddf6b227567e2 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 17 Apr 2025 19:18:54 +0200
Subject: [PATCH 20/53] fix cuda BatchedGemmEx

---
 Grid/algorithms/blas/BatchedBlas.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index d25077345f..8c188321aa 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -100,7 +100,7 @@ class GridBLAS {
   }
 
 #ifdef GRID_CUDA
-  cudaDataType toDataType(GridBLASPrecision_t p) {
+  cublasComputeType_t toDataType(GridBLASPrecision_t p) {
     switch (p) {
     case GridBLAS_PRECISION_16F:
       return CUBLAS_COMPUTE_32F_FAST_16F;
@@ -537,16 +537,16 @@ class GridBLAS {
 			       (cuComplex **)&Cmn[0], ldc,
 			       batchCount);
     } else {
-      cudaDataType compute_precision = toDataType(precision);
+      cublasComputeType_t compute_precision = toDataType(precision);
       err = cublasGemmBatchedEx(gridblasHandle,
 				hOpA,
 				hOpB,
 				m,n,k,
-				(cuComplex *) &alpha_p[0],
-				(cuComplex **)&Amk[0], CUDA_C_32F, lda,
-				(cuComplex **)&Bkn[0], CUDA_C_32F, ldb,
-				(cuComplex *) &beta_p[0],
-				(cuComplex **)&Cmn[0], CUDA_C_32F, ldc,
+				(void *) &alpha_p[0],
+				(void **)&Amk[0], CUDA_C_32F, lda,
+				(void **)&Bkn[0], CUDA_C_32F, ldb,
+				(void *) &beta_p[0],
+				(void **)&Cmn[0], CUDA_C_32F, ldc,
 				batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
     }
     assert(err==CUBLAS_STATUS_SUCCESS);

From 0fb69d6fa3bd284d75779be0fdec02f059cd08c9 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 8 May 2025 08:32:02 +0000
Subject: [PATCH 21/53] no thread_for for identity operator

---
 Grid/lattice/Lattice_base.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 515c847f95..d8b9a618b9 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -236,7 +236,7 @@ class Lattice : public LatticeAccelerator<vobj>
   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
     vobj vtmp;
     vtmp = r;
-#if 0
+#if 1
     deviceVector<vobj> vvtmp(1);
     acceleratorPut(vvtmp[0],vtmp);
     vobj *vvtmp_p = & vvtmp[0];

From f666dea5cf91e219021c2a6e0d778e72338c83e9 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Fri, 23 May 2025 20:58:16 +0000
Subject: [PATCH 22/53] Making running on Aurora more debuggable

---
 Grid/algorithms/approx/Chebyshev.h     |  2 ++
 Grid/communicator/Communicator_mpi3.cc | 12 +++++++++
 Grid/stencil/Stencil.h                 |  1 +
 Grid/util/Init.cc                      |  7 +++--
 HMC/ComputeWilsonFlow.cc               |  9 ++++++-
 visualisation/CMakeLists.txt           | 37 ++++++++++++++++++++++++++
 visualisation/cmake-command            | 17 ++++++++++++
 7 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100644 visualisation/CMakeLists.txt
 create mode 100644 visualisation/cmake-command

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 3c6796855d..5863e948d0 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -269,7 +269,9 @@ class Chebyshev : public OperatorFunction<Field> {
     RealD xscale = 2.0/(hi-lo);
     RealD mscale = -(hi+lo)/(hi-lo);
     Linop.HermOp(T0,y);
+    grid->Barrier();
     axpby(T1,xscale,mscale,y,in);
+    grid->Barrier();
 
     // sum = .5 c[0] T0 + c[1] T1
     //    out = ()*T0 + Coeffs[1]*T1;
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 7dc706dfb7..55febcee01 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -259,32 +259,39 @@ CartesianCommunicator::~CartesianCommunicator()
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("GlobalSumP2P");
   CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("GlobalSumP2P");
   CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("AllReduce");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+  FlightRecorder::StepLog("AllReduceVector");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
@@ -715,6 +722,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 
 void CartesianCommunicator::StencilBarrier(void)
 {
+  FlightRecorder::StepLog("NodeBarrier");
   MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -722,11 +730,13 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+  FlightRecorder::StepLog("GridBarrier");
   int ierr = MPI_Barrier(communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("Broadcast");
   int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
@@ -745,6 +755,7 @@ void CartesianCommunicator::BarrierWorld(void){
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("BroadcastWorld");
   int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
@@ -767,6 +778,7 @@ void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
+  FlightRecorder::StepLog("AllToAll");
   // MPI is a pain and uses "int" arguments
   // 64*64*64*128*16 == 500Million elements of data.
   // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 1142891ae5..53e6c56a48 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -385,6 +385,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
     }
+    FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 1424667e00..3e7f5a6ddd 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -631,12 +631,11 @@ void Grid_debug_handler_init(void)
   sa.sa_flags    = SA_SIGINFO;
   //  sigaction(SIGSEGV,&sa,NULL);
   sigaction(SIGTRAP,&sa,NULL);
-  sigaction(SIGBUS,&sa,NULL);
+  //  sigaction(SIGBUS,&sa,NULL);
   //  sigaction(SIGUSR2,&sa,NULL);
 
-  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
-
-  sigaction(SIGFPE,&sa,NULL);
+  //  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  //  sigaction(SIGFPE,&sa,NULL);
   sigaction(SIGKILL,&sa,NULL);
   sigaction(SIGILL,&sa,NULL);
 
diff --git a/HMC/ComputeWilsonFlow.cc b/HMC/ComputeWilsonFlow.cc
index 7d86af8c4d..ddbc5a4949 100644
--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -73,7 +73,7 @@ template <class T> void writeFile(T& in, std::string const fname){
   Grid::emptyUserRecord record;
   Grid::ScidacWriter WR(in.Grid()->IsBoss());
   WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
   WR.close();
 #endif
   // What is the appropriate way to throw error?
@@ -171,6 +171,12 @@ int main(int argc, char **argv) {
     std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
     writeFile(qfield,tfile);
 
+    std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
+    {
+      //      PeriodicGimplR::GaugeField Ucopy = U;
+      //      NerscIO::writeConfiguration(Ucopy,ufile);
+    }
+    
     RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
     RealD T = real( sum(qfield) );
     Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
@@ -183,6 +189,7 @@ int main(int argc, char **argv) {
   
   int t=WFPar.maxTau;
   WF.smear(Uflow, Umu);
+  //  NerscIO::writeConfiguration(Uflow,filesmr);
 
   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
diff --git a/visualisation/CMakeLists.txt b/visualisation/CMakeLists.txt
new file mode 100644
index 0000000000..4e9cf7b6b7
--- /dev/null
+++ b/visualisation/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+
+project(GridViewer)
+
+list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/")
+
+find_package(VTK COMPONENTS 
+  CommonColor
+  CommonCore
+  FiltersCore
+  FiltersModeling
+  IOImage
+  IOFFMPEG
+  InteractionStyle
+  InteractionWidgets
+  RenderingContextOpenGL2
+  RenderingCore
+  RenderingFreeType
+  RenderingGL2PSOpenGL2
+  RenderingOpenGL2
+)
+
+if (NOT VTK_FOUND)
+  message(FATAL_ERROR "GridViewer: Unable to find the VTK build folder.")
+endif()
+
+# Prevent a "command line is too long" failure in Windows.
+set(CMAKE_NINJA_FORCE_RESPONSE_FILE "ON" CACHE BOOL "Force Ninja to use response files.")
+
+add_executable(FieldDensityAnimate MACOSX_BUNDLE FieldDensityAnimate.cxx )
+  target_link_libraries(FieldDensityAnimate PRIVATE ${VTK_LIBRARIES}
+)
+# vtk_module_autoinit is needed
+vtk_module_autoinit(
+  TARGETS FieldDensityAnimate
+  MODULES ${VTK_LIBRARIES}
+)
diff --git a/visualisation/cmake-command b/visualisation/cmake-command
new file mode 100644
index 0000000000..4f18c11016
--- /dev/null
+++ b/visualisation/cmake-command
@@ -0,0 +1,17 @@
+export grid_config=/home/paboyle/GPT/install/bin/grid-config
+libs=`$grid_config --libs`
+ldflags=`$grid_config --ldflags`
+cxxflags=`$grid_config --cxxflags`
+cxx=`$grid_config --cxx`
+cc=icx
+
+mkdir build
+cd build
+
+echo CC $cc
+echo CXX $cxx
+echo CXXFLAGS $cxxflags
+echo LDFLAGS  $ldflags
+echo LIBS  $libs
+
+LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "

From 1d234921dbbfea80f10c3f72423aa081d280e8ed Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 22 Jun 2025 13:11:17 +0000
Subject: [PATCH 23/53] --debug-stdout in subdirectory

---
 Grid/util/Init.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 3e7f5a6ddd..02a70100ac 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -382,15 +382,20 @@ void Grid_init(int *argc,char ***argv)
   /////////////////////////////////////////////////////////////////
   if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
     Grid_quiesce_nodes();
-  } else { 
+  } else {
+    char* root = getenv("GRID_STDOUT_ROOT");
     FILE *fp;
     std::ostringstream fname;
+    if (root)
+      fname << root << "/";
     fname<<"Grid.stdout.";
     fname<<CartesianCommunicator::RankWorld();
     fp=freopen(fname.str().c_str(),"w",stdout);
     assert(fp!=(FILE *)NULL);
 
     std::ostringstream ename;
+    if (root)
+      ename << root << "/";
     ename<<"Grid.stderr.";
     ename<<CartesianCommunicator::RankWorld();
     fp=freopen(ename.str().c_str(),"w",stderr);

From a04a2b7ac3ac6feaa84066e72a2b62ceb60da11b Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 26 Jun 2025 21:01:28 +0200
Subject: [PATCH 24/53] fix communicator_none

---
 Grid/communicator/Communicator_base.cc |  2 --
 Grid/communicator/Communicator_base.h  | 10 +++-------
 Grid/communicator/Communicator_none.cc | 14 +++++++-------
 configure.ac                           |  4 ++--
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc
index a52e85a465..f9a4c442d9 100644
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@@ -41,8 +41,6 @@ bool Stencil_force_mpi = true;
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
-CartesianCommunicator::StencilCompressionPolicy_t  
-CartesianCommunicator::StencilCompressionPolicy= CartesianCommunicator::StencilCompressionPolicyNone;
 
 /////////////////////////////////
 // Grid information queries
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 1f985b1777..8fd8ec347f 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -51,10 +51,6 @@ class CartesianCommunicator : public SharedMemory {
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
   static int       nCommThreads;
 
-  enum StencilCompressionPolicy_t { StencilCompressionPolicyNone, StencilCompressionPolicyBfloat16 };
-  static StencilCompressionPolicy_t StencilCompressionPolicy;
-  static void SetStencilCompressionPolicy(StencilCompressionPolicy_t policy ) { StencilCompressionPolicy = policy; }
-
   ////////////////////////////////////////////
   // Communicator should know nothing of the physics grid, only processor grid.
   ////////////////////////////////////////////
@@ -192,7 +188,7 @@ class CartesianCommunicator : public SharedMemory {
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
 			       int recv_from_rank,int do_recv,
-			       int bytes,int dir,size_t word_size);
+			       int bytes,int dir);
 
   double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 				      void *xmit,
@@ -210,10 +206,10 @@ class CartesianCommunicator : public SharedMemory {
 				    int xmit_to_rank,int do_xmit,
 				    void *recv,void *recv_comp,
 				    int recv_from_rank,int do_recv,
-				    int xbytes,int rbytes,int dir,size_t word_size);
+				    int xbytes,int rbytes,int dir);
   
   
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i,size_t word_size);
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
   void StencilBarrier(void);
 
   ////////////////////////////////////////////////////////////
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 3048841ca5..4be3c1feff 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -130,7 +130,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,
 						     int recv_from_rank,int dor,
-						     int bytes, int dir,size_t word_size)
+						     int bytes, int dir)
 {
   return 2.0*bytes;
 }
@@ -146,15 +146,15 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
-							 int xmit_to_rank,int dox,
-							 void *recv,
-							 int recv_from_rank,int dor,
-							 int xbytes,int rbytes, int dir,size_t word_size)
+							 void *xmit,void *xmit_comp,
+							 int dest,int dox,
+							 void *recv,void *recv_comp,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
 {
   return xbytes+rbytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir,size_t word_size)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
 }
 
diff --git a/configure.ac b/configure.ac
index 9664a6758d..1aa24dd4e2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_PREREQ([2.69])
+AC_PREREQ([2.72])
 AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
@@ -214,7 +214,7 @@ esac
 
 ############### Symplectic group
 AC_ARG_ENABLE([Sp],
-    [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
+    [AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])],
     [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
 
 AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])

From 1016ed514089a56bc8bdfb2872679c98959402cf Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 27 Jun 2025 08:14:08 +0200
Subject: [PATCH 25/53] experiment with AC 2.71

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 1aa24dd4e2..29c71b0650 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_PREREQ([2.72])
+AC_PREREQ([2.71])
 AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST

From ffc6dbda2403e84c34be20e0c71b05429f826685 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 27 Jun 2025 20:16:07 +0000
Subject: [PATCH 26/53] towards argonne compatibility

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 29c71b0650..dc09f87c3b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_PREREQ([2.71])
+AC_PREREQ([2.69])
 AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST

From dc734001bf65c24baeb8a81111d44afd3ac6c436 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 28 Jun 2025 08:26:52 +0000
Subject: [PATCH 27/53] SYCL blas compatibility

---
 Grid/algorithms/blas/BatchedBlas.h | 137 +++++++++++++++++------------
 1 file changed, 83 insertions(+), 54 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index a769e011b6..c1025b5953 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -1049,6 +1049,40 @@ class GridBLAS {
   
 #else
 
+#ifdef GRID_SYCL
+  template<typename T>
+  void getrfBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info) {
+    
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
+				    &n, &n,
+				    (T **)&Ann[0],
+				    &n,
+				    (int64_t**)&_ipiv[0],
+				    (int64_t)1, &batchCount,
+				    (T*)&scratchpad[0], (int64_t)scratchpad.size(),
+				    std::vector<sycl::event>());
+    synchronise();
+  }
+#endif
+
   void getrfBatched(int64_t n,
 		    deviceVector<ComplexD*> &Ann,
 		    deviceVector<int64_t> &ipiv,
@@ -1075,24 +1109,11 @@ class GridBLAS {
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    assert(0);
-    /*
-      TODO: cache scratchpad
-
-    oneapi::mkl::lapack::gerf_batch(*gridblasHandle,
-				    &n, &n,
-				    (ComplexD **)&Ann[0],
-				    &n,
-				    (int64_t)&ipiv[0],
-				    (int64_t)1, &batchCount,
-				    scratchpad, scratchpad_size,
-				    std::vector<sycl::event>());
-    */
-    synchronise();
+    getrfBatchedSYCL(n, Ann, ipiv, info);
 #endif
   }
 
-void getrfBatched(int64_t n,
+  void getrfBatched(int64_t n,
 		    deviceVector<ComplexF*> &Ann,
 		    deviceVector<int64_t> &ipiv,
 		    deviceVector<int64_t> &info)
@@ -1118,23 +1139,55 @@ void getrfBatched(int64_t n,
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    assert(0);
-    /*
-      TODO: cache scratchpad
+    getrfBatchedSYCL(n, Ann, ipiv, info);
+#endif
+  }
+
+#ifdef GRID_SYCL
+  template<typename T>
+  void getriBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info,
+			deviceVector<T*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getri_batch(*gridblasHandle,
+				     &n,
+				     (T **)&Ann[0],
+				     &n,
+				     (int64_t**)p_ipiv,
+				     (int64_t)1, &batchCount,
+				     (T *)&scratchpad[0], (int64_t)scratchpad.size(),
+				     std::vector<sycl::event>());
 
-    oneapi::mkl::lapack::gerf_batch(*gridblasHandle,
-				    &n, &n,
-				    (ComplexD **)&Ann[0],
-				    &n,
-				    (int64_t)&ipiv[0],
-				    (int64_t)1, &batchCount,
-				    scratchpad, scratchpad_size,
-				    std::vector<sycl::event>());
-    */
     synchronise();
-#endif
+
+    T** pA = &Ann[0];
+    T** pC = &Cnn[0];
+    accelerator_for(i, batchCount*n*n, 1, {
+	auto j = i / batchCount;
+	auto k = i % batchCount;
+	pC[k][j] = pA[k][j];
+      });
   }
 
+#endif
+
   void getriBatched(int64_t n,
 		    deviceVector<ComplexD*> &Ann,
 		    deviceVector<int64_t> &ipiv,
@@ -1165,19 +1218,7 @@ void getrfBatched(int64_t n,
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    assert(0);
-    /*
-      TODO: cache scratchpad
-    oneapi::mkl::lapack::geri_batch(*gridblasHandle,
-				    &n, &n,
-				    (ComplexD **)&Ann[0],
-				    &n,
-				    (int64_t)&ipiv[0],
-				    (int64_t)1, &batchCount,
-				    scratchpad, scratchpad_size,
-				    std::vector<sycl::event>());
-    */
-    synchronise();
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
 #endif
   }
 
@@ -1211,19 +1252,7 @@ void getrfBatched(int64_t n,
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    assert(0);
-    /*
-      TODO: cache scratchpad
-    oneapi::mkl::lapack::geri_batch(*gridblasHandle,
-				    &n, &n,
-				    (ComplexD **)&Ann[0],
-				    &n,
-				    (int64_t)&ipiv[0],
-				    (int64_t)1, &batchCount,
-				    scratchpad, scratchpad_size,
-				    std::vector<sycl::event>());
-    */
-    synchronise();
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
 #endif
   }
 

From 5cca827323b27e1cba33ae3fa5e4d09cf0bc2851 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 30 Jun 2025 13:08:42 +0000
Subject: [PATCH 28/53] Rollback half-prec comms for now

---
 Grid/communicator/Communicator_base.h         |   5 +-
 Grid/communicator/Communicator_mpi3.cc        |  37 +--
 Grid/communicator/Communicator_none.cc        |  12 +-
 Grid/communicator/SharedMemoryMPI.cc          |  56 +++-
 .../action/fermion/ImprovedStaggeredFermion.h |   6 -
 .../fermion/ImprovedStaggeredFermion5D.h      |   6 -
 .../action/fermion/NaiveStaggeredFermion.h    |   6 -
 Grid/qcd/action/fermion/WilsonCompressor.h    | 206 ++++++++++++-
 Grid/qcd/action/fermion/WilsonFermion.h       |   6 -
 Grid/qcd/action/fermion/WilsonFermion5D.h     |   9 +-
 Grid/stencil/Stencil.cc                       |   9 +-
 Grid/stencil/Stencil.h                        | 283 ++++--------------
 12 files changed, 326 insertions(+), 315 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 8fd8ec347f..62fcfa7b01 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -183,7 +183,6 @@ class CartesianCommunicator : public SharedMemory {
 		      int recv_from_rank,
 		      int bytes);
   
-  int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
@@ -202,9 +201,9 @@ class CartesianCommunicator : public SharedMemory {
   void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
 
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				    void *xmit,void *xmit_comp,
+				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
-				    void *recv,void *recv_comp,
+				    void *recv,
 				    int recv_from_rank,int do_recv,
 				    int xbytes,int rbytes,int dir);
   
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 552d7f198b..8f3bf9afc3 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d)
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
-  FlightRecorder::StepLog("AllReduce float");
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  FlightRecorder::StepLog("AllReduce double");
+  FlightRecorder::StepLog("AllReduce");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
-  FlightRecorder::StepLog("AllReduce uint32_t");
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
-  FlightRecorder::StepLog("AllReduce uint64_t");
+  FlightRecorder::StepLog("AllReduce");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
@@ -301,31 +301,26 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  FlightRecorder::StepLog("GlobalXOR");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
-  FlightRecorder::StepLog("GlobalMax");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
-  FlightRecorder::StepLog("GlobalMax");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
-  FlightRecorder::StepLog("GlobalSumVector(float *)");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
-  FlightRecorder::StepLog("GlobalSumVector(double *)");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
@@ -400,16 +395,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   std::vector<CommsRequest_t> list;
   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  offbytes       += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }
-int CartesianCommunicator::IsOffNode(int rank)
-{
-  int grank = ShmRanks[rank];
-  if ( grank == MPI_UNDEFINED ) return true;
-  else return false;
-}
+
 
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -424,9 +414,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,void *xmit_comp,
+							 void *xmit,
 							 int dest,int dox,
-							 void *recv,void *recv_comp,
+							 void *recv,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
@@ -450,8 +440,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
-      //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
-      ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
       assert(ierr==0);
       list.push_back(rrq);
       off_node_bytes+=rbytes;
@@ -460,7 +449,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     else { 
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
       assert(shm!=NULL);
-      //      std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
     }
 #endif
@@ -469,7 +457,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       assert(ierr==0);
       list.push_back(xrq);
       off_node_bytes+=xbytes;
@@ -688,9 +676,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 }  
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,void *xmit_comp,
+							 void *xmit,
 							 int dest,int dox,
-							 void *recv,void *recv_comp,
+							 void *recv,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
@@ -841,7 +829,6 @@ int CartesianCommunicator::RankWorld(void){
   return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
-  FlightRecorder::StepLog("BarrierWorld");
   int ierr = MPI_Barrier(communicator_world);
   assert(ierr==0);
 }
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 4be3c1feff..3dee8f4d11 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -124,8 +124,6 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
   dest=0;
 }
 
-int CartesianCommunicator::IsOffNode(int rank) { return false; }
-
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,
@@ -146,11 +144,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,void *xmit_comp,
-							 int dest,int dox,
-							 void *recv,void *recv_comp,
-							 int from,int dor,
-							 int xbytes,int rbytes,int dir)
+							 void *xmit,
+							 int xmit_to_rank,int dox,
+							 void *recv,
+							 int recv_from_rank,int dor,
+							 int xbytes,int rbytes, int dir)
 {
   return xbytes+rbytes;
 }
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 469ec3bb09..b5fd4197b9 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -543,21 +543,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   // printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else 
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
+#if 0
+  #warning "Moving host buffers to specific NUMA domain"
+  int numa;
+  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
+  if(numa_name) {
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
+    numa = atoi(numa_name);
+    unsigned long page_count = bytes/page_size;
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
+    for(unsigned long p=0;p<page_count;p++){
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
+    }
+    int ret = move_pages(0,
+			 page_count,
+			 &pages[0],
+			 &nodes[0],
+			 &status[0],
+			 MPOL_MF_MOVE);
+    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
+    if (ret) perror(" move_pages failed for reason:");
+  }
+#endif  
+  acceleratorPin(HostCommBuf,bytes);
+#endif  
+
 #endif  
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);  
   }
   if ( WorldRank == 0 ){
-    std::cout << Mheader " acceleratorAllocDevice "<< bytes 
+    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
-  if ( WorldRank == 0 ){
-    std::cout<< Mheader "Setting up IPC"<<std::endl;
-  }
+  std::cout<< "Setting up IPC"<<std::endl;
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over ranks/gpu's on our node
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -588,6 +616,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( err != ZE_RESULT_SUCCESS ) {
 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
+      } else {
+	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
       }
       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
       handle.pid = getpid();
@@ -646,12 +676,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
       myfd=UnixSockets::RecvFileDescriptor();
 #else
-      //      std::cout<<"mapping seeking remote pid/fd "
-      //	       <<handle.pid<<"/"
-      //	       <<handle.fd<<std::endl;
+      std::cout<<"mapping seeking remote pid/fd "
+	       <<handle.pid<<"/"
+	       <<handle.fd<<std::endl;
 
       int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-      //      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
       myfd  = syscall(438,pidfd,handle.fd,0);
       int err_t = errno;
@@ -661,7 +691,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	assert(0);
       }
 #endif
-      //      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
 
@@ -670,6 +700,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
+      } else {
+	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
+	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
       }
       assert(thisBuf!=nullptr);
     }
@@ -750,7 +783,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     WorldShmCommBufs[r] =ptr;
     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
-  std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 };
@@ -1007,13 +1039,11 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   int gpeer = ShmRanks[rank];
   assert(gpeer!=ShmRank); // never send to self
-  //  std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
   if (gpeer == MPI_UNDEFINED){
     return NULL;
   } else { 
     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
-    //    std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
     return (void *) remote;
   }
 }
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
index e567c19fee..f7655f2498 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -154,12 +154,6 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
-  void SloppyComms(int sloppy)
-  {
-    Stencil.SetSloppyComms(sloppy);
-    StencilEven.SetSloppyComms(sloppy);
-    StencilOdd.SetSloppyComms(sloppy);
-  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index ce65bfa3c9..2641a6b8a5 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -179,12 +179,6 @@ class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public Improv
   StencilImpl Stencil; 
   StencilImpl StencilEven; 
   StencilImpl StencilOdd; 
-  void SloppyComms(int sloppy)
-  {
-    Stencil.SetSloppyComms(sloppy);
-    StencilEven.SetSloppyComms(sloppy);
-    StencilOdd.SetSloppyComms(sloppy);
-  }
     
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
index 53b1826792..9ec6be9016 100644
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -146,12 +146,6 @@ class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStagger
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
-  void SloppyComms(int sloppy)
-  {
-    Stencil.SetSloppyComms(sloppy);
-    StencilEven.SetSloppyComms(sloppy);
-    StencilOdd.SetSloppyComms(sloppy);
-  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index 458f2c8364..22f2d8e31f 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -32,6 +32,209 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
+///////////////////////////////////////////////////////////////
+// Wilson compressor will need FaceGather policies for:
+// Periodic, Dirichlet, and partial Dirichlet for DWF
+///////////////////////////////////////////////////////////////
+const int dwf_compressor_depth=2;
+#define DWF_COMPRESS
+class FaceGatherPartialDWF
+{
+public:
+#ifdef DWF_COMPRESS
+  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
+#else
+  static int PartialCompressionFactor(GridBase *grid) { return 1;}
+#endif
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
+				   const Lattice<vobj> &rhs,
+				   cobj *buffer,
+				   compressor &compress,
+				   int off,int so,int partial)
+  {
+    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
+    //  Shrinks local and remote comms buffers
+    GridBase *Grid = rhs.Grid();
+    int Ls = Grid->_rdimensions[0];
+#ifdef DWF_COMPRESS
+    int depth=dwf_compressor_depth;
+#else 
+    int depth=Ls/2;
+#endif
+    std::pair<int,int> *table_v = & table[0];
+    auto rhs_v = rhs.View(AcceleratorRead);
+    int vol=table.size()/Ls;
+    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
+	Integer i=idx/Ls;
+	Integer s=idx%Ls;
+	Integer sc=depth+s-(Ls-depth);
+	if(s<depth)     compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
+	if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
+    });
+    rhs_v.ViewClose();
+  }
+  template<class decompressor,class Decompression>
+  static void DecompressFace(decompressor decompress,Decompression &dd)
+  {
+    auto Ls = dd.dims[0];
+#ifdef DWF_COMPRESS
+    int depth=dwf_compressor_depth;
+#else
+    int depth=Ls/2;
+#endif    
+    // Just pass in the Grid
+    auto kp = dd.kernel_p;
+    auto mp = dd.mpi_p;
+    int size= dd.buffer_size;
+    int vol= size/Ls;
+    accelerator_forNB(o,size,1,{
+	int idx=o/Ls;
+	int   s=o%Ls;
+	if ( s < depth ) {
+	  int oo=s*vol+idx;
+	  kp[o]=mp[oo];
+	} else if ( s >= Ls-depth ) {
+	  int sc = depth + s - (Ls-depth);
+	  int oo=sc*vol+idx; 
+	  kp[o]=mp[oo];
+	} else {
+	  kp[o] = Zero();//fill rest with zero if partial dirichlet
+	}
+    });
+  }
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Need to gather *interior portions* for ALL s-slices in simd directions
+  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
+  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
+				    compressor &compress,int type,int partial)
+  {
+    GridBase *Grid = rhs.Grid();
+    int Ls = Grid->_rdimensions[0];
+#ifdef DWF_COMPRESS
+    int depth=dwf_compressor_depth;
+#else
+    int depth = Ls/2;
+#endif
+    
+    // insertion of zeroes...
+    assert( (table.size()&0x1)==0);
+    int num=table.size()/2;
+    int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
+    
+    auto rhs_v = rhs.View(AcceleratorRead);
+    auto p0=&pointers[0][0];
+    auto p1=&pointers[1][0];
+    auto tp=&table[0];
+    int nnum=num/Ls;
+    accelerator_forNB(j, num, vobj::Nsimd(), {
+	//  Reorders both local and remote comms buffers
+	//  
+	int s  = j % Ls;
+	int sp1 = (s+depth)%Ls;  // peri incremented s slice
+	
+	int hxyz= j/Ls;
+
+	int xyz0= hxyz*2; // xyzt part of coor
+	int xyz1= hxyz*2+1;
+	
+	int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
+	
+	int kk0= xyz0*Ls + s ; // s=0 goes to s=1
+	int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
+	compress.CompressExchange(p0[jj],p1[jj],
+				  rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
+				  rhs_v[so+tp[kk1 ].second], 
+				  type);
+    });
+    rhs_v.ViewClose();
+  }
+  // Merge routine is for SIMD faces
+  template<class decompressor,class Merger>
+  static void MergeFace(decompressor decompress,Merger &mm)
+  {
+    auto Ls = mm.dims[0];
+#ifdef DWF_COMPRESS
+    int depth=dwf_compressor_depth;
+#else
+    int depth = Ls/2;
+#endif
+    int  num= mm.buffer_size/2; // relate vol and Ls to buffer size
+    auto mp = &mm.mpointer[0];
+    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
+    auto vp1= &mm.vpointers[1][0];
+    auto type= mm.type;
+    int nnum = num/Ls;
+    accelerator_forNB(o,num,Merger::Nsimd,{
+
+	int  s=o%Ls;
+	int hxyz=o/Ls; // xyzt related component
+	int xyz0=hxyz*2;
+	int xyz1=hxyz*2+1;
+
+	int sp = (s+depth)%Ls; 
+	int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
+
+	int oo0= s+xyz0*Ls;
+	int oo1= s+xyz1*Ls;
+
+	// same ss0, ss1 pair goes to new layout
+	decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
+      });
+  }
+};
+class FaceGatherDWFMixedBCs
+{
+public:
+#ifdef DWF_COMPRESS
+  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
+#else 
+  static int PartialCompressionFactor(GridBase *grid) {return 1;}
+#endif
+  
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
+					 const Lattice<vobj> &rhs,
+					 cobj *buffer,
+					 compressor &compress,
+					 int off,int so,int partial)
+  {
+    //    std::cout << " face gather simple DWF partial "<<partial <<std::endl;
+    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
+    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
+  }
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
+				    compressor &compress,int type,int partial)
+  {
+    //    std::cout << " face gather exch DWF partial "<<partial <<std::endl;
+    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
+    else        FaceGatherSimple::Gather_plane_exchange    (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
+  }
+  template<class decompressor,class Merger>
+  static void MergeFace(decompressor decompress,Merger &mm)
+  {
+    int partial = mm.partial;
+    //    std::cout << " merge DWF partial "<<partial <<std::endl;
+    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
+    else           FaceGatherSimple::MergeFace(decompress,mm);
+  }
+
+  template<class decompressor,class Decompression>
+  static void DecompressFace(decompressor decompress,Decompression &dd)
+  {
+    int partial = dd.partial;
+    //    std::cout << " decompress DWF partial "<<partial <<std::endl;
+    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
+    else           FaceGatherSimple::DecompressFace(decompress,dd);
+  }
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -39,7 +242,8 @@ NAMESPACE_BEGIN(Grid);
 
 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate : public FaceGatherSimple
+class WilsonCompressorTemplate  : public FaceGatherDWFMixedBCs
+//  : public FaceGatherSimple
 {
 public:
   
diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h
index 41a2471e2b..16320a9330 100644
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -165,12 +165,6 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
-  void SloppyComms(int sloppy)
-  {
-    Stencil.SetSloppyComms(sloppy);
-    StencilEven.SetSloppyComms(sloppy);
-    StencilOdd.SetSloppyComms(sloppy);
-  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index ea0eaa32e0..9dadc866f6 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -204,14 +204,7 @@ class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
   DoubledGaugeField Umu;
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;
-
-
-  void SloppyComms(int sloppy)
-  {
-    Stencil.SetSloppyComms(sloppy);
-    StencilEven.SetSloppyComms(sloppy);
-    StencilOdd.SetSloppyComms(sloppy);
-  }
+    
   // Comms buffer
   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 
diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc
index 1d04716918..27dc75eddd 100644
--- a/Grid/stencil/Stencil.cc
+++ b/Grid/stencil/Stencil.cc
@@ -30,26 +30,25 @@
 NAMESPACE_BEGIN(Grid);
 
 uint64_t DslashFullCount;
-//uint64_t DslashPartialCount;
+uint64_t DslashPartialCount;
 uint64_t DslashDirichletCount;
 
 void DslashResetCounts(void)
 {
   DslashFullCount=0;
-  //  DslashPartialCount=0;
+  DslashPartialCount=0;
   DslashDirichletCount=0;
 }
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
 {
   dirichlet = DslashDirichletCount;
-  partial   = 0;
+  partial   = DslashPartialCount;
   full      = DslashFullCount;
 }
 void DslashLogFull(void)     { DslashFullCount++;}
-//void DslashLogPartial(void)  { DslashPartialCount++;}
+void DslashLogPartial(void)  { DslashPartialCount++;}
 void DslashLogDirichlet(void){ DslashDirichletCount++;}
 
-deviceVector<unsigned char> StencilBuffer::DeviceCommBuf; 
 
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table)
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index cf38db27eb..20237732df 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
   Coordinate dirichlet; // Blocksize of dirichlet BCs
-  //  int  partialDirichlet;
+  int  partialDirichlet;
   DefaultImplParams()  {
     dirichlet.resize(0);
-    //    partialDirichlet=0;
+    partialDirichlet=0;
   };
 };
 
@@ -69,12 +69,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);
 
-class StencilBuffer
-{
-public:
-  static deviceVector<unsigned char> DeviceCommBuf;     // placed in Stencil.cc
-};
-
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -119,8 +113,8 @@ class CartesianStencilAccelerator {
   ///////////////////////////////////////////////////
   // If true, this is partially communicated per face
   ///////////////////////////////////////////////////
-  //  StencilVector _comms_partial_send; 
-  //  StencilVector _comms_partial_recv;
+  StencilVector _comms_partial_send; 
+  StencilVector _comms_partial_recv;
   //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
@@ -211,16 +205,16 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   struct Packet {
     void * send_buf;
     void * recv_buf;
-    void * compressed_send_buf;
-    void * compressed_recv_buf;
+#ifndef ACCELERATOR_AWARE_MPI
+    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
+    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
+#endif
     Integer to_rank;
     Integer from_rank;
     Integer do_send;
     Integer do_recv;
     Integer xbytes;
     Integer rbytes;
-    Integer xbytes_compressed;
-    Integer rbytes_compressed;
   };
   struct Merge {
     static constexpr int Nsimd = vobj::Nsimd();
@@ -229,7 +223,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
-    //    Integer partial; // partial dirichlet BCs
+    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct Decompress {
@@ -237,7 +231,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
-    //    Integer partial; // partial dirichlet BCs
+    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct CopyReceiveBuffer {
@@ -258,45 +252,9 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
 protected:
   GridBase *                        _grid;
-
-  ///////////////////////////////////////////////////
-  // Sloppy comms will make a second buffer upon comms
-  ///////////////////////////////////////////////////
-  size_t device_heap_top;  //
-  size_t device_heap_bytes;//
-  size_t device_heap_size; //
-  void *DeviceBufferMalloc(size_t bytes)
-  {
-    void *ptr = (void *)device_heap_top;
-    device_heap_top  += bytes;
-    device_heap_bytes+= bytes;
-    if ( device_heap_bytes > device_heap_size ) {
-      std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
-      assert (device_heap_bytes <= device_heap_size);
-    }
-    return ptr;
-  }
-  void  DeviceBufferFreeAll(void)
-  {
-    device_heap_size = _unified_buffer_size*sizeof(cobj);
-    // Resize up if necessary, never down
-    if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
-      StencilBuffer::DeviceCommBuf.resize(device_heap_size);
-    }
-    device_heap_top  =(size_t) &StencilBuffer::DeviceCommBuf[0];
-    device_heap_size = StencilBuffer::DeviceCommBuf.size();
-    device_heap_bytes=0;
-  }
-
 public:
   GridBase *Grid(void) const { return _grid; }
 
-  /////////////////////////////////////////////////////////
-  // Control reduced precision comms
-  /////////////////////////////////////////////////////////
-  int SloppyComms;
-  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
-
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
   // without adding parameters. Perhaps a template parameter to StenciView is
@@ -310,7 +268,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
 
   int face_table_computed;
-  //  int partialDirichlet;
+  int partialDirichlet;
   int fullDirichlet;
   std::vector<deviceVector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
@@ -403,145 +361,24 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
-  void DecompressPacket(Packet &packet)
-  {
-    if ( !SloppyComms ) return;
-
-    if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
-
-      typedef typename getPrecision<cobj>::real_scalar_type word;
-      uint64_t words = packet.rbytes/sizeof(word);
-      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
-      const uint64_t outer = words/nsimd;
-
-      if(sizeof(word)==8) {
-
-	// Can either choose to represent as float vs double and prec change
-	// OR
-	// truncate the mantissa bfp16 style
-	double *dbuf =(double *) packet.recv_buf;
-	float  *fbuf =(float  *) packet.compressed_recv_buf;
-
-	accelerator_forNB(ss,outer,nsimd,{
-	  int lane = acceleratorSIMTlane(nsimd);
-	  dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
-	});
-
-      } else if ( sizeof(word)==4){
-	// Can either choose to represent as half vs float and prec change
-        // OR
-	// truncate the mantissa bfp16 style
-
-	uint32_t *fbuf =(uint32_t *) packet.recv_buf;
-	uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
-
-	accelerator_forNB(ss,outer,nsimd,{
-	  int lane = acceleratorSIMTlane(nsimd);
-	  fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
-	});
-
-      } else {
-	assert(0 && "unknown floating point precision");
-      }
-    }
-  }
-  void CompressPacket(Packet &packet)
-  {
-    packet.xbytes_compressed = packet.xbytes;
-    packet.compressed_send_buf = packet.send_buf;
-
-    packet.rbytes_compressed = packet.rbytes;
-    packet.compressed_recv_buf = packet.recv_buf;
-
-    if ( !SloppyComms  ) {
-      return;
-    }
-
-    typedef typename getPrecision<cobj>::real_scalar_type word;
-    uint64_t words = packet.xbytes/sizeof(word);
-    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
-    const uint64_t outer = words/nsimd;
-
-    if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
-
-      packet.rbytes_compressed = packet.rbytes/2;
-      packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
-      //      std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
-      
-    }
-    //else {
-    //      std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
-    //    }
-    
-    if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
-
-      packet.xbytes_compressed = packet.xbytes/2;
-      packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
-      //      std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
-
-      if(sizeof(word)==8) {
-
-	double *dbuf =(double *) packet.send_buf;
-	float  *fbuf =(float  *) packet.compressed_send_buf;
-
-	accelerator_forNB(ss,outer,nsimd,{
-	  int lane = acceleratorSIMTlane(nsimd);
-	  fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
-	});
-
-      } else if ( sizeof(word)==4){
-
-	uint32_t *fbuf =(uint32_t *) packet.send_buf;
-	uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
-	
-	accelerator_forNB(ss,outer,nsimd,{
-	  int lane = acceleratorSIMTlane(nsimd);
-	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
-	});
-
-      } else {
-	assert(0 && "unknown floating point precision");
-      }
-
-    }
-    //    else {
-    //      std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
-    //    }
-
-    return;
-  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
+    //    std::cout << "Communicate Begin "<<std::endl;
+    //    _grid->Barrier();
     FlightRecorder::StepLog("Communicate begin");
-    ///////////////////////////////////////////////
     // All GPU kernel tasks must complete
-    //    accelerator_barrier();      All kernels should ALREADY be complete
-    //Everyone is here, so noone running slow and still using receive buffer
-    _grid->StencilBarrier();
-    // But the HaloGather had a barrier too.
-    ///////////////////////////////////////////////
-    if (SloppyComms) {
-      DeviceBufferFreeAll();
-    }
-    for(int i=0;i<Packets.size();i++){
-      this->CompressPacket(Packets[i]);
-    }
-    if (SloppyComms) { 
-      accelerator_barrier();
-#ifdef NVLINK_GET
-      _grid->StencilBarrier(); 
-#endif
-    }
-    
+    //    accelerator_barrier();     // All kernels should ALREADY be complete
+    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
+                               // But the HaloGather had a barrier too.
     for(int i=0;i<Packets.size();i++){
       //      std::cout << "Communicate prepare "<<i<<std::endl;
       //      _grid->Barrier();
       _grid->StencilSendToRecvFromPrepare(MpiReqs,
-					  Packets[i].compressed_send_buf,
+					  Packets[i].send_buf,
 					  Packets[i].to_rank,Packets[i].do_send,
-					  Packets[i].compressed_recv_buf,
+					  Packets[i].recv_buf,
 					  Packets[i].from_rank,Packets[i].do_recv,
-					  Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
+					  Packets[i].xbytes,Packets[i].rbytes,i);
     }
     //    std::cout << "Communicate PollDtoH "<<std::endl;
     //    _grid->Barrier();
@@ -552,22 +389,19 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     // Starts intranode
     for(int i=0;i<Packets.size();i++){
       //      std::cout << "Communicate Begin "<<i<<std::endl;
-      //      _grid->Barrier();
       _grid->StencilSendToRecvFromBegin(MpiReqs,
-					Packets[i].send_buf,Packets[i].compressed_send_buf,
+					Packets[i].send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
-					Packets[i].recv_buf,Packets[i].compressed_recv_buf,
+					Packets[i].recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
-      //      std::cout << "Communicate Begin started "<<i<<std::endl;
-      //      _grid->Barrier();
+					Packets[i].xbytes,Packets[i].rbytes,i);
     }
     FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
       if ( Packets[i].do_send )
-	FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
+	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
     }
   }
 
@@ -582,15 +416,14 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     //    std::cout << "Communicate Complete Complete "<<std::endl;
     //    _grid->Barrier();
     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    //    if   ( this->partialDirichlet ) DslashLogPartial();
-    if ( this->fullDirichlet ) DslashLogDirichlet();
+    if   ( this->partialDirichlet ) DslashLogPartial();
+    else if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     //    accelerator_barrier(); 
     for(int i=0;i<Packets.size();i++){
-      this->DecompressPacket(Packets[i]);
       if ( Packets[i].do_recv )
-	FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
+	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
     }
     FlightRecorder::StepLog("Finish communicate complete");
   }
@@ -785,7 +618,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
     Decompress d;
-    //    d.partial  = this->partialDirichlet;
+    d.partial  = this->partialDirichlet;
     d.dims     = _grid->_fdimensions;
     d.kernel_p = k_p;
     d.mpi_p    = m_p;
@@ -794,7 +627,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
-    //    m.partial  = this->partialDirichlet;
+    m.partial  = this->partialDirichlet;
     m.dims     = _grid->_fdimensions;
     m.type     = type;
     m.mpointer = merge_p;
@@ -899,8 +732,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
-      //      this->_comms_partial_send[ii] = 0;
-      //      this->_comms_partial_recv[ii] = 0;
+      this->_comms_partial_send[ii] = 0;
+      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
 	// Quiesce communication across block boundaries
@@ -921,10 +754,10 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
-	//	if ( partialDirichlet ) {
-	//	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
-	//	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-	//	}
+	if ( partialDirichlet ) {
+	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	}
       }
     }
   }
@@ -936,7 +769,6 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 		   Parameters p=Parameters(),
 		   bool preserve_shm=false)
   {
-    SloppyComms = 0;
     face_table_computed=0;
     _grid    = grid;
     this->parameters=p;
@@ -954,7 +786,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     this->same_node.resize(npoints);
 
     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    //    partialDirichlet = p.partialDirichlet;
+    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
     fullDirichlet=0;
     for(int d=0;d<p.dirichlet.size();d++){
@@ -1035,7 +867,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     /////////////////////////////////////////////////////////////////////////////////
     const int Nsimd = grid->Nsimd();
 
-    // Allow for multiple stencils to be communicated simultaneously
+    // Allow for multiple stencils to exist simultaneously
     if (!preserve_shm)
       _grid->ShmBufferFreeAll();
 
@@ -1103,8 +935,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     GridBase *grid=_grid;
     const int Nsimd = grid->Nsimd();
 
-    //    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
-    int comms_recv      = this->_comms_recv[point];
+    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
     int fd              = _grid->_fdimensions[dimension];
     int ld              = _grid->_ldimensions[dimension];
     int rd              = _grid->_rdimensions[dimension];
@@ -1293,8 +1124,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
     int comms_send   = this->_comms_send[point];
     int comms_recv   = this->_comms_recv[point];
-    //    int comms_partial_send   = this->_comms_partial_send[point] ;
-    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    int comms_partial_send   = this->_comms_partial_send[point] ;
+    int comms_partial_recv   = this->_comms_partial_recv[point] ;
     
     assert(rhs.Grid()==_grid);
     //	  conformable(_grid,rhs.Grid());
@@ -1329,11 +1160,11 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	int rbytes;
 
 	if ( comms_send ) xbytes = bytes; // Full send
-	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0; // full dirichlet
 
 	if ( comms_recv ) rbytes = bytes;
-	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 	
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1360,8 +1191,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	}
 
 
-	//	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
-	if ( compress.DecompressionStep()&&comms_recv) {
+	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
 	  recv_buf=u_simd_recv_buf[0];
 	} else {
 	  recv_buf=this->u_recv_buf_p;
@@ -1395,8 +1225,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 #endif
 
 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-	//	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
-	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
+	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
 
         int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
 	if ( !duplicate ) { // Force comms for now
@@ -1405,8 +1234,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
-	  int do_send = (comms_send) && (!shm_send );
-	  int do_recv = (comms_send) && (!shm_recv );
+	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
@@ -1414,7 +1243,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 		    xbytes,rbytes);
 	}
 
-	if ( (compress.DecompressionStep() && comms_recv) ) {
+	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
 	  AddDecompress(&this->u_recv_buf_p[comm_off],
 			&recv_buf[comm_off],
 			words,Decompressions);
@@ -1436,8 +1265,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
     int comms_send   = this->_comms_send[point];
     int comms_recv   = this->_comms_recv[point];
-    //    int comms_partial_send   = this->_comms_partial_send[point] ;
-    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    int comms_partial_send   = this->_comms_partial_send[point] ;
+    int comms_partial_recv   = this->_comms_partial_recv[point] ;
 
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1512,20 +1341,18 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
 	
 	if ( comms_send ) xbytes = bytes;
-	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0;
 
 	if ( comms_recv ) rbytes = bytes;
-	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 
 	// Gathers SIMD lanes for send and merge
 	// Different faces can be full comms or partial comms with  multiple ranks per node
-	//	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
-	if ( comms_send || comms_recv ) {
+	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
 
-	  //	  int partial = partialDirichlet;
-	  int partial = 0;
+	  int partial = partialDirichlet;
 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs,
 					    spointers,dimension,sx,cbmask,
 					    compress,permute_type,partial );
@@ -1591,8 +1418,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
-	      //	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	      int do_send = (comms_send) && (!shm_send );
+	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
@@ -1606,8 +1432,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  }
 	}
 	// rpointer may be doing a remote read in the gather over SHM
-	//	if ( comms_recv|comms_partial_recv ) {
-	if ( comms_recv ) {
+	if ( comms_recv|comms_partial_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
 

From 87350e03b8ac068e3e693443da1675ef04681060 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 6 Jul 2025 16:33:45 +0000
Subject: [PATCH 29/53] checksum comms

---
 Grid/communicator/Communicator_base.h  |  2 +-
 Grid/communicator/Communicator_mpi3.cc | 56 +++++++++++++++++++++++-
 Grid/cshift/Cshift_mpi.h               | 59 ++++++++++++++++++++++++--
 benchmarks/Benchmark_comms.cc          |  3 +-
 benchmarks/Benchmark_dwf_fp32.cc       |  2 +-
 configure.ac                           | 11 +++++
 6 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 62fcfa7b01..1c08762355 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -243,7 +243,7 @@ class CartesianCommunicator : public SharedMemory {
     Broadcast(root,(void *)&data,sizeof(data));
   }
 
-}; 
+};
 
 NAMESPACE_END(Grid);
 
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 8f3bf9afc3..c4524a4616 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -32,6 +32,7 @@ NAMESPACE_BEGIN(Grid);
 
 
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
 
 ////////////////////////////////////////////
 // First initialise of comms system
@@ -555,12 +556,19 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
    * - post MPI receive buffers asynch
    * - post device - host send buffer transfer asynch
    */
-  
+
+#ifdef GRID_CHECKSUM_COMMS
+  rbytes += 8;
+  xbytes += 8;
+#endif
+
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
+
       host_recv = this->HostBufferMalloc(rbytes);
       ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+
       assert(ierr==0);
       CommsRequest_t srq;
       srq.PacketType = InterNodeRecv;
@@ -579,9 +587,17 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       tag= dir+_processor*32;
 
       host_xmit = this->HostBufferMalloc(xbytes);
+
       CommsRequest_t srq;
 
+#ifdef GRID_CHECKSUM_COMMS
+      uint64_t xbytes_data = xbytes - 8;
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
+      assert(xbytes % 8 == 0);
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = svm_xor((uint64_t*)xmit, xbytes_data / 8) ^ 1; // flip one bit so that a zero buffer is not consistent
+#else
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#endif
       
       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       //      assert(ierr==0);
@@ -623,7 +639,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
 
 	if ( flag ) {
 	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+#ifdef GRID_CHECKSUM_COMMS
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
+#else
 	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+#endif
 	  list[idx].PacketType=InterNodeReceiveHtoD;
 	} else {
 	  pending ++;
@@ -786,7 +806,39 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //    }
   //  }
-  
+
+#ifdef GRID_CHECKSUM_COMMS
+  for(int r=0;r<list.size();r++){
+    if ( list[r].PacketType == InterNodeReceiveHtoD ) {
+      uint64_t rbytes_data = list[r].bytes - 8;
+      uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
+      uint64_t computed_cs = svm_xor((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ 1;
+      if (expected_cs != computed_cs) {
+	// TODO: error message, backtrace, quit
+
+	fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
+	fprintf(stderr, " processor = %d\n", (int)_processor);
+	for(int d=0;d<_processors.size();d++)
+	  fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
+	fprintf(stderr, " hostname: %s\n", GridHostname());
+	fprintf(stderr, " expected_cs: %ld\n", expected_cs);
+	fprintf(stderr, " computed_cs: %ld\n", computed_cs);
+	fprintf(stderr, " dest: %d\n", list[r].dest);
+	fprintf(stderr, " tag: %d\n", list[r].tag);
+	fprintf(stderr, " commdir: %d\n", list[r].commdir);
+	fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
+
+	fflush(stderr);
+
+	// backtrace
+	int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+	backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
+
+	exit(1);
+      }
+    }
+  }
+#endif
   
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 05ee946b3a..8119bafc30 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -125,11 +125,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+
 #ifndef ACCELERATOR_AWARE_MPI
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+
   static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
   static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
 #endif
-  
+#endif
+
   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
   RealD tcopy=0.0;
@@ -180,12 +189,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 #else
       // bouncy bouncy
       acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+
+#ifdef GRID_CHECKSUM_COMMS
+      assert(bytes % 8 == 0);
+      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)&send_buf[0], bytes / 8) ^ 1;
+      bytes += 8;
+#endif
+      
       grid->SendToRecvFrom((void *)&hsend_buf[0],
 			   xmit_to_rank,
 			   (void *)&hrecv_buf[0],
 			   recv_from_rank,
 			   bytes);
+
+#ifdef GRID_CHECKSUM_COMMS
+      bytes -= 8;
       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+      uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+      uint64_t computed_cs = svm_xor((uint64_t*)&recv_buf[0], bytes / 8) ^ 1;
+      assert(expected_cs == computed_cs);
+#else
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
 #endif
       FlightRecorder::StepLog("Cshift_SendRecv_complete");
 
@@ -255,9 +281,18 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     send_buf_extract[s].resize(buffer_size);
     recv_buf_extract[s].resize(buffer_size);
   }
+
 #ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
 #endif
   
   int bytes = buffer_size*sizeof(scalar_object);
@@ -320,12 +355,30 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #else
       // bouncy bouncy
 	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+
+#ifdef GRID_CHECKSUM_COMMS
+	assert(bytes % 8 == 0);
+	*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ 1;
+	bytes += 8;
+#endif
+	
 	grid->SendToRecvFrom((void *)&hsend_buf[0],
 			     xmit_to_rank,
 			     (void *)&hrecv_buf[0],
 			     recv_from_rank,
 			     bytes);
+	
+
+#ifdef GRID_CHECKSUM_COMMS
+	bytes -= 8;
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+	uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+	uint64_t computed_cs = svm_xor((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ 1;
+	assert(expected_cs == computed_cs);
+#else
+	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif
+
 #endif
 
 	xbytes+=bytes;
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 6696c8ebae..5feb97227c 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -219,7 +219,8 @@ int main (int argc, char ** argv)
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
-            int tid = omp_get_thread_num();
+            //int tid = omp_get_thread_num();
+	    int tid=0;
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
 					       (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
 
diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index 583def29b0..a722346449 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -262,7 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   FermionAction::ImplParams p;
   p.dirichlet=Dirichlet;
   FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
-  Dw.SloppyComms(sloppy);
+  //Dw.SloppyComms(sloppy);
   Dw.ImportGauge(Umu);
   
   int ncall =300;
diff --git a/configure.ac b/configure.ac
index c6a4d187ba..f4fcf20c95 100644
--- a/configure.ac
+++ b/configure.ac
@@ -263,6 +263,17 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
     *);;
 esac
 
+############### CHECKSUM COMMS
+AC_ARG_ENABLE([checksum-comms],
+    [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
+    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+
+case ${ac_CHECKSUM_COMMS} in
+    yes)
+      AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
+    *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
     [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],

From 912b6cf389194e58e233f28c44edfaea3bf959a9 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sun, 6 Jul 2025 20:34:40 +0000
Subject: [PATCH 30/53] refined checksum

---
 Grid/communicator/Communicator_mpi3.cc | 10 ++++++++--
 Grid/cshift/Cshift_mpi.h               | 16 +++++++++++-----
 Grid/lattice/Lattice_reduction_sycl.h  | 20 ++++++++++++++++++++
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index c4524a4616..c58a3439e9 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -32,7 +32,11 @@ NAMESPACE_BEGIN(Grid);
 
 
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+
+#ifdef GRID_CHECKSUM_COMMS
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
+uint64_t checksum_index = 1;
+#endif
 
 ////////////////////////////////////////////
 // First initialise of comms system
@@ -558,6 +562,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
    */
 
 #ifdef GRID_CHECKSUM_COMMS
+  checksum_index += 1;
   rbytes += 8;
   xbytes += 8;
 #endif
@@ -576,6 +581,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       srq.req        = rrq;
       srq.host_buf   = host_recv;
       srq.device_buf = recv;
+      srq.tag        = tag;
       list.push_back(srq);
       off_node_bytes+=rbytes;
     }
@@ -594,7 +600,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       uint64_t xbytes_data = xbytes - 8;
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
       assert(xbytes % 8 == 0);
-      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = svm_xor((uint64_t*)xmit, xbytes_data / 8) ^ 1; // flip one bit so that a zero buffer is not consistent
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag); // flip one bit so that a zero buffer is not consistent
 #else
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
 #endif
@@ -812,7 +818,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
     if ( list[r].PacketType == InterNodeReceiveHtoD ) {
       uint64_t rbytes_data = list[r].bytes - 8;
       uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
-      uint64_t computed_cs = svm_xor((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ 1;
+      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
       if (expected_cs != computed_cs) {
 	// TODO: error message, backtrace, quit
 
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 8119bafc30..199721b478 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -29,8 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
 
+NAMESPACE_BEGIN(Grid);
+
+#ifdef GRID_CHECKSUM_COMMS
+extern uint64_t checksum_index;
+#endif
 
-NAMESPACE_BEGIN(Grid); 
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
@@ -192,7 +196,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 
 #ifdef GRID_CHECKSUM_COMMS
       assert(bytes % 8 == 0);
-      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)&send_buf[0], bytes / 8) ^ 1;
+      checksum_index++;
+      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
       bytes += 8;
 #endif
       
@@ -206,7 +211,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
       bytes -= 8;
       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
       uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
-      uint64_t computed_cs = svm_xor((uint64_t*)&recv_buf[0], bytes / 8) ^ 1;
+      uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
       assert(expected_cs == computed_cs);
 #else
       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
@@ -358,7 +363,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 
 #ifdef GRID_CHECKSUM_COMMS
 	assert(bytes % 8 == 0);
-	*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ 1;
+	checksum_index++;
+	*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
 	bytes += 8;
 #endif
 	
@@ -373,7 +379,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	bytes -= 8;
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
 	uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
-	uint64_t computed_cs = svm_xor((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ 1;
+	uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
 	assert(expected_cs == computed_cs);
 #else
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
index bc9257b91c..463607f5b5 100644
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -88,5 +88,25 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
   return ret;
 }
 
+template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
+{
+  Word identity;  identity=0;
+  Word ret = 0;
+  { 
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
+                      Reduction,
+                      [=] (sycl::id<1> index, auto &sum) {
+			auto l = index % 61;
+                        sum ^= vec[index]<<l | vec[index]>>(64-l);
+                      });
+    });
+  }
+  theGridAccelerator->wait();
+  return ret;
+}
+
 NAMESPACE_END(Grid);
 

From c0ca7f05db5fa2b37c668472aab78578001c92cd Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 7 Jul 2025 06:02:47 +0000
Subject: [PATCH 31/53] checksum fix

---
 Grid/communicator/Communicator_mpi3.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index c58a3439e9..2dad9698fb 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -562,7 +562,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
    */
 
 #ifdef GRID_CHECKSUM_COMMS
-  checksum_index += 1;
   rbytes += 8;
   xbytes += 8;
 #endif
@@ -844,8 +843,10 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
       }
     }
   }
+
+  checksum_index += 1;
 #endif
-  
+
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
 #ifndef NVLINK_GET

From 6307fcb43936e928c80f6cf2d23d0b1102e7b93f Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 9 Jul 2025 09:00:25 +0200
Subject: [PATCH 32/53] view logging

---
 Grid/lattice/Lattice_view.h | 44 ++++++++++++++++++++++++++++++++++
 Grid/stencil/Stencil.h      |  6 +++++
 Grid/util/FlightRecorder.cc | 48 +++++++++++++++++++++++++++++++++++++
 Grid/util/FlightRecorder.h  | 18 ++++++++++++++
 configure.ac                | 11 +++++++++
 5 files changed, 127 insertions(+)

diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
index 064c10e6f5..63065efdad 100644
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -106,6 +106,49 @@ class LatticeView : public LatticeAccelerator<vobj>
   }
 
 };
+
+#ifdef GRID_LOG_VIEWS
+
+// Little autoscope assister
+template<class View> 
+class ViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+  const char* filename; int line, mode;
+ public:
+  ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
+    v(_v), filename(_filename), line(_line), mode(_mode) {
+
+    switch (mode){
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case CpuRead:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    } 
+
+  };
+  ~ViewCloser() {
+
+    switch (mode) {
+    case AcceleratorWrite:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    }
+      
+    v.ViewClose();
+  }
+};
+
+#define autoView(l_v,l,mode)				\
+	  auto l_v = l.View(mode);			\
+	  ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
+
+#else
+
+
 // Little autoscope assister
 template<class View> 
 class ViewCloser
@@ -119,6 +162,7 @@ class ViewCloser
 #define autoView(l_v,l,mode)				\
 	  auto l_v = l.View(mode);			\
 	  ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////
 // Lattice expression types used by ET to assemble the AST
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 20237732df..c7c14658af 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -186,6 +186,12 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
 
   void ViewClose(void)  {  }
 
+
+#ifdef GRID_LOG_VIEWS
+  size_t size() { return sizeof(uint64_t); };
+  uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
+#endif
+  
 };
 
 ////////////////////////////////////////
diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc
index 139e7957de..4f4468010d 100644
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@@ -372,4 +372,52 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
   }
 }
 
+
+
+#ifdef GRID_LOG_VIEWS
+
+bool ViewLogger::Enabled = false;
+std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
+
+void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
+void ViewLogger::End() { Enabled = false; }
+
+void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes) {
+  if (!Enabled)
+    return;
+  
+  size_t i = LogVector.size();
+  LogVector.resize(i + 1);
+  auto & n = LogVector[i];
+
+  n.filename = filename;
+  n.line = line;
+  n.index = index;
+
+  if (bytes < sizeof(uint64_t)) {
+    
+    n.head = n.tail = 0;
+    
+  } else {
+  
+    switch (mode) {
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case AcceleratorWriteDiscard:
+      acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
+      acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
+      break;
+      
+    case CpuRead:
+    case CpuWrite:
+      //case CpuWriteDiscard:
+      n.head = *(uint64_t*)data;
+      n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
+      break;
+    }
+  }
+}
+
+#endif
+
 NAMESPACE_END(Grid);
diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h
index 7c1cd17191..ee35f22591 100644
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@@ -42,5 +42,23 @@ class FlightRecorder {
   static void xmitLog(void *,uint64_t bytes);
   static void recvLog(void *,uint64_t bytes,int rank);
 };
+
+#ifdef GRID_LOG_VIEWS
+class ViewLogger {
+  struct Entry_t {
+    const char* filename;
+    int line;
+    int index;
+    uint64_t head, tail;
+  };
+
+public:
+  static bool Enabled;
+  static std::vector<Entry_t> LogVector;
+  static void Begin();
+  static void End();
+  static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
+};
+#endif
 NAMESPACE_END(Grid);
 
diff --git a/configure.ac b/configure.ac
index f4fcf20c95..84e8f46ef1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -274,6 +274,17 @@ case ${ac_CHECKSUM_COMMS} in
     *);;
 esac
 
+############### LOG VIEWS
+AC_ARG_ENABLE([log-views],
+    [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
+    [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
+
+case ${ac_LOG_VIEWS} in
+    yes)
+      AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
+    *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
     [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],

From 5ad20a90917e893287f5bd0f659dbea6d793cc9e Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Thu, 10 Jul 2025 20:15:53 +0000
Subject: [PATCH 33/53] more appropriate view modes

---
 Grid/cshift/Cshift_common.h                               | 6 +++---
 .../action/fermion/implementation/CayleyFermion5Dcache.h  | 8 ++++----
 .../fermion/implementation/WilsonKernelsImplementation.h  | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index fdb98cd4e3..b8099c2788 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
   {
     auto buffer_p = & buffer[0];
     auto table = MapCshiftTable();
-    autoView( rhs_v, rhs, AcceleratorWrite);
+    autoView( rhs_v, rhs, AcceleratorWriteDiscard);
     accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
     });
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   if(cbmask ==0x3 ) {
     int _slice_stride = rhs.Grid()->_slice_stride[dimension];
     int _slice_block = rhs.Grid()->_slice_block[dimension];
-    autoView( rhs_v , rhs, AcceleratorWrite);
+    autoView( rhs_v , rhs, AcceleratorWriteDiscard);
     accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
 	int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   {
     auto table = MapCshiftTable();
     autoView(rhs_v , rhs, AcceleratorRead);
-    autoView(lhs_v , lhs, AcceleratorWrite);
+    autoView(lhs_v , lhs, AcceleratorWriteDiscard);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 5fbc7612b5..f3e4aa026f 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -52,7 +52,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
   GridBase *grid=psi_i.Grid();
   autoView(psi , psi_i,AcceleratorRead);
   autoView(phi , phi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWrite);
+  autoView(chi , chi_i,AcceleratorWriteDiscard);
   assert(phi.Checkerboard() == psi.Checkerboard());
 
   int Ls =this->Ls;
@@ -94,7 +94,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
   GridBase *grid=psi_i.Grid();
   autoView(psi , psi_i,AcceleratorRead);
   autoView(phi , phi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWrite);
+  autoView(chi , chi_i,AcceleratorWriteDiscard);
   assert(phi.Checkerboard() == psi.Checkerboard());
 
   int Ls=this->Ls;
@@ -130,7 +130,7 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
   GridBase *grid=psi_i.Grid();
 
   autoView(psi , psi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWrite);
+  autoView(chi , chi_i,AcceleratorWriteDiscard);
 
   int Ls=this->Ls;
 
@@ -194,7 +194,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
   int Ls=this->Ls;
 
   autoView(psi , psi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWrite);
+  autoView(chi , chi_i,AcceleratorWriteDiscard);
 
   acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
   acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 26e80838d2..0908f72fcf 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -500,7 +500,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 {
     autoView(U_v  ,  U,AcceleratorRead);
     autoView(in_v , in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWrite);
+    autoView(out_v,out,AcceleratorWriteDiscard);
     autoView(st_v , st,AcceleratorRead);
 
    if( interior && exterior ) {
@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 {
     autoView(U_v  ,  U,AcceleratorRead);
     autoView(in_v , in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWrite);
+    autoView(out_v,out,AcceleratorWriteDiscard);
     autoView(st_v , st,AcceleratorRead);
     KERNEL_CALL_ID(GenericDhopSite);
 }
@@ -546,7 +546,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   {
     autoView(U_v  ,U,AcceleratorRead);
     autoView(in_v ,in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWrite);
+    autoView(out_v,out,AcceleratorWriteDiscard);
     autoView(st_v ,st,AcceleratorRead);
 
    if( interior && exterior ) {

From 84c7196709db5f1ca156d04e407d44de2725bc26 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 11 Jul 2025 06:59:48 +0000
Subject: [PATCH 34/53] stencil view plays nicely with view logging

---
 Grid/stencil/Stencil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index c7c14658af..5be82aa459 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -188,7 +188,7 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
 
 
 #ifdef GRID_LOG_VIEWS
-  size_t size() { return sizeof(uint64_t); };
+  size_t size() { return 0; };
   uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
 #endif
   

From 965cee4372db2a61f0f13546fd521670a614495c Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 14 Jul 2025 18:08:20 +0000
Subject: [PATCH 35/53] towards cleanup of LogViews

---
 Grid/lattice/Lattice_view.h                                 | 2 ++
 .../fermion/implementation/WilsonKernelsImplementation.h    | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
index 63065efdad..2f6a29e613 100644
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -133,7 +133,9 @@ class ViewCloser
 
     switch (mode) {
     case AcceleratorWrite:
+    case AcceleratorWriteDiscard:
     case CpuWrite:
+    // case CpuWriteDiscard: <- this is a duplicate of CpuWrite (does not make sense to differentiate)
       ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
       break;
     }
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 0908f72fcf..26e80838d2 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -500,7 +500,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 {
     autoView(U_v  ,  U,AcceleratorRead);
     autoView(in_v , in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWriteDiscard);
+    autoView(out_v,out,AcceleratorWrite);
     autoView(st_v , st,AcceleratorRead);
 
    if( interior && exterior ) {
@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 {
     autoView(U_v  ,  U,AcceleratorRead);
     autoView(in_v , in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWriteDiscard);
+    autoView(out_v,out,AcceleratorWrite);
     autoView(st_v , st,AcceleratorRead);
     KERNEL_CALL_ID(GenericDhopSite);
 }
@@ -546,7 +546,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   {
     autoView(U_v  ,U,AcceleratorRead);
     autoView(in_v ,in,AcceleratorRead);
-    autoView(out_v,out,AcceleratorWriteDiscard);
+    autoView(out_v,out,AcceleratorWrite);
     autoView(st_v ,st,AcceleratorRead);
 
    if( interior && exterior ) {

From d7cdab503d0ca84e5a30b5b829726332a61d3ff7 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 22 Jul 2025 05:31:18 +0000
Subject: [PATCH 36/53] double up sycl fences and barriers

---
 Grid/threads/Accelerator.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2b91165cf4..8b4d51d093 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -346,7 +346,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 		     });						\
   });
 
-#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
@@ -355,7 +355,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
-inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); theCopyAccelerator->wait(); }
 
 
 ///////
@@ -377,9 +377,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
 
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
 
 inline int  acceleratorIsCommunicable(void *ptr)
 {
@@ -652,7 +652,7 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 //////////////////////////////////////////////
 
 #ifdef GRID_SYCL
-inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
+inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); theGridAccelerator->ext_oneapi_submit_barrier(); };
 #else
 // Ordering within a stream guaranteed on Nvidia & AMD
 inline void acceleratorFenceComputeStream(void){ };

From 81cc867748be6cba303eec2e6f0d776afc04dc95 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 23 Jul 2025 08:13:22 +0200
Subject: [PATCH 37/53] SYCL lane

---
 Grid/threads/Accelerator.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2b91165cf4..71dd2a3ab1 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -317,13 +317,11 @@ extern sycl::queue *theCopyAccelerator;
 #define accelerator 
 #define accelerator_inline strong_inline
 
-accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
- return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
+#define acceleratorSIMTlane(Nsimd) _lane
 #else
- return 0;
+#define acceleratorSIMTlane(Nsimd) 0
 #endif
-} // SYCL specific
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
   theGridAccelerator->submit([&](sycl::handler &cgh) {		\
@@ -341,7 +339,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
-		       auto lane     = item.get_global_id(2);		\
+		       auto _lane     = item.get_global_id(2);		\
 		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
 		     });						\
   });

From 8fbeee866191b5e8cd8e2cd1423ef7a60bc736a6 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 23 Jul 2025 06:37:53 +0000
Subject: [PATCH 38/53] cshift test

---
 Grid/cshift/Cshift_mpi.h   | 4 ++++
 Grid/threads/Accelerator.h | 8 +++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 199721b478..86d4305354 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -380,6 +380,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
 	uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
 	uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+	if (expected_cs != computed_cs) {
+	  fprintf(stderr, "CSHIFT %ld <> %ld\n", expected_cs, computed_cs);
+	  fflush(stderr);
+	}
 	assert(expected_cs == computed_cs);
 #else
 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index fc3531ca79..8b4d51d093 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -317,11 +317,13 @@ extern sycl::queue *theCopyAccelerator;
 #define accelerator 
 #define accelerator_inline strong_inline
 
+accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-#define acceleratorSIMTlane(Nsimd) _lane
+ return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 #else
-#define acceleratorSIMTlane(Nsimd) 0
+ return 0;
 #endif
+} // SYCL specific
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
   theGridAccelerator->submit([&](sycl::handler &cgh) {		\
@@ -339,7 +341,7 @@ extern sycl::queue *theCopyAccelerator;
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
-		       auto _lane     = item.get_global_id(2);		\
+		       auto lane     = item.get_global_id(2);		\
 		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
 		     });						\
   });

From 08b3844d54d0a86e03d88dcfb47eb661e2ec01fa Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 23 Jul 2025 16:08:19 +0200
Subject: [PATCH 39/53] remove error message since fallback to small-sum path
 is taken

---
 Grid/lattice/Lattice_reduction_gpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 91cb8226bd..3485114c40 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -52,7 +52,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
   // let the number of threads in a block be a multiple of 2, starting from warpSize
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
-    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    //std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
     return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;

From d22d07bda5c7f09cdc7798f0d5761ee96cc62868 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 26 Jul 2025 05:20:44 +0000
Subject: [PATCH 40/53] checksum index per Grid to play nice with split grid

---
 Grid/communicator/Communicator_base.h  |  5 +++++
 Grid/communicator/Communicator_mpi3.cc | 11 +++++++----
 Grid/cshift/Cshift_mpi.h               |  8 ++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 1c08762355..a0cb01341c 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -243,6 +243,11 @@ class CartesianCommunicator : public SharedMemory {
     Broadcast(root,(void *)&data,sizeof(data));
   }
 
+#ifdef GRID_CHECKSUM_COMMS
+  uint64_t checksumIndex;
+
+  uint64_t incrementChecksumIndex() { return checksumIndex++; };
+#endif
 };
 
 NAMESPACE_END(Grid);
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 2dad9698fb..b992f59fb6 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -35,7 +35,6 @@ Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 
 #ifdef GRID_CHECKSUM_COMMS
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
-uint64_t checksum_index = 1;
 #endif
 
 ////////////////////////////////////////////
@@ -250,6 +249,10 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
     MPI_Comm_dup(communicator,&communicator_halo[i]);
   }
   assert(Size==_Nprocessors);
+
+#ifdef GRID_CHECKSUM_COMMS
+  checksumIndex = 0;
+#endif
 }
 
 CartesianCommunicator::~CartesianCommunicator()
@@ -599,7 +602,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       uint64_t xbytes_data = xbytes - 8;
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
       assert(xbytes % 8 == 0);
-      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag); // flip one bit so that a zero buffer is not consistent
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksumIndex + 1 + 1000 * tag); // flip one bit so that a zero buffer is not consistent
 #else
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
 #endif
@@ -817,7 +820,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
     if ( list[r].PacketType == InterNodeReceiveHtoD ) {
       uint64_t rbytes_data = list[r].bytes - 8;
       uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
-      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
+      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksumIndex + 1 + 1000 * list[r].tag); //
       if (expected_cs != computed_cs) {
 	// TODO: error message, backtrace, quit
 
@@ -844,7 +847,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
     }
   }
 
-  checksum_index += 1;
+  incrementChecksumIndex();
 #endif
 
   list.resize(0);               // Delete the list
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 86d4305354..17057acec0 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -31,10 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
-#ifdef GRID_CHECKSUM_COMMS
-extern uint64_t checksum_index;
-#endif
-
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
@@ -196,7 +192,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 
 #ifdef GRID_CHECKSUM_COMMS
       assert(bytes % 8 == 0);
-      checksum_index++;
+      uint64_t checksum_index = grid->incrementChecksumIndex();
       *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
       bytes += 8;
 #endif
@@ -363,7 +359,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 
 #ifdef GRID_CHECKSUM_COMMS
 	assert(bytes % 8 == 0);
-	checksum_index++;
+	uint64_t checksum_index = grid->incrementChecksumIndex();
 	*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
 	bytes += 8;
 #endif

From b83023567bb79194f62f8ad84a1bb2f0a7ab1602 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 28 Jul 2025 21:34:18 +0200
Subject: [PATCH 41/53] DeviceMemoryAllocator

---
 Grid/allocator/Allocator.h              |   1 +
 Grid/allocator/DeviceMemoryAllocator.cc | 114 ++++++++++++++++++++++++
 Grid/allocator/DeviceMemoryAllocator.h  |  37 ++++++++
 Grid/threads/Accelerator.h              |  10 +++
 configure.ac                            |  11 +++
 5 files changed, 173 insertions(+)
 create mode 100644 Grid/allocator/DeviceMemoryAllocator.cc
 create mode 100644 Grid/allocator/DeviceMemoryAllocator.h

diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h
index 589ea36f83..f151bc0d54 100644
--- a/Grid/allocator/Allocator.h
+++ b/Grid/allocator/Allocator.h
@@ -2,3 +2,4 @@
 #include <Grid/allocator/MemoryStats.h>
 #include <Grid/allocator/MemoryManager.h>
 #include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/allocator/DeviceMemoryAllocator.h>
diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
new file mode 100644
index 0000000000..0965207c52
--- /dev/null
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -0,0 +1,114 @@
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024)
+
+
+struct DeviceMemoryAllocator {
+
+  bool initialized;
+  char* base;
+  size_t size;
+  size_t offset;
+
+  DeviceMemoryAllocator() {
+    initialized = false;
+    base = 0;
+    size = 0;
+    offset = 0;
+  }
+
+  ~DeviceMemoryAllocator() {
+    if (initialized) {
+      acceleratorFreeDevice(base);
+      initialized = false;
+    }
+  }
+
+  std::vector<size_t> pages;
+  std::map<size_t, std::vector<size_t> > size_map;
+
+  void Init(size_t _size) {
+    assert(!initialized);
+    
+    size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    std::cout << GridLogMessage << "Init with " <<
+      size << " bytes" << std::endl;
+
+    base = (char*)acceleratorAllocDeviceInternal(size);
+    assert(base);
+    
+    std::cout << GridLogMessage << "Initialize memory to zero" << std::endl;
+
+    {
+      uint64_t* ba = (uint64_t*)base;
+      size_t n = size / sizeof(uint64_t);
+      accelerator_for(i, n, 1, {
+	  ba[i] = (uint64_t)-1;
+	});
+    }  
+    
+    std::cout << GridLogMessage << "Done" << std::endl;
+    
+    offset = 0;
+    
+    pages.resize(n_pages, 0);
+    
+    std::cout << GridLogMessage << "Pages initialized" << std::endl;
+  
+    initialized = true;
+  }
+};
+
+static DeviceMemoryAllocator dma;
+
+void *acceleratorAllocDevice(size_t bytes) {
+  if (!dma.initialized)
+    dma.Init(MemoryManager::DeviceMaxBytes);
+  
+  if (!bytes)
+    bytes++;
+  
+  size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  
+  // first check if
+  auto & sm = dma.size_map[n_pages];
+  if (sm.size()) {
+    size_t index = sm.back();
+    sm.pop_back();
+    assert(dma.pages[index] == 0);
+    dma.pages[index] = n_pages;
+    
+    return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  }
+
+  size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  assert(end <= dma.size);
+  dma.pages[dma.offset] = n_pages;
+
+  void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  dma.offset += n_pages;
+
+  std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate ("
+	    << (dma.size - end) * 100 / dma.size << "% free)" << std::endl;
+
+  return ptr;
+
+}
+
+void acceleratorFreeDevice(void *ptr) {
+  if (!dma.initialized)
+    return;
+  
+  size_t index = ((size_t)((char*)ptr - dma.base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  size_t n_pages = dma.pages[index];
+  //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl;
+  dma.pages[index] = 0;
+  auto & sm = dma.size_map[n_pages];
+  sm.push_back(index);
+  
+}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h
new file mode 100644
index 0000000000..d3401d7204
--- /dev/null
+++ b/Grid/allocator/DeviceMemoryAllocator.h
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/DeviceMemoryAllocator.h
+
+    Copyright (C) 2025
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+NAMESPACE_BEGIN(Grid);
+
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
+void *acceleratorAllocDevice(size_t bytes);
+void acceleratorFreeDevice(void *ptr);
+#endif
+
+NAMESPACE_END(Grid);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 8b4d51d093..9d9f765d2b 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -44,6 +44,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
 #endif
 
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
+#define acceleratorAllocDevice acceleratorAllocDeviceInternal
+#define acceleratorFreeDevice acceleratorFreeDeviceInternal
+#endif
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////////////////////////////////
@@ -722,3 +727,8 @@ template<class T> T acceleratorGet(T& dev)
 
 
 NAMESPACE_END(Grid);
+
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
+#undef acceleratorAllocDevice
+#undef acceleratorFreeDevice
+#endif
diff --git a/configure.ac b/configure.ac
index 84e8f46ef1..82954e753a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -274,6 +274,17 @@ case ${ac_CHECKSUM_COMMS} in
     *);;
 esac
 
+############### DEVICE MEMORY ALLOCATOR
+AC_ARG_ENABLE([device-memory-allocator],
+    [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])],
+    [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=yes])
+
+case ${ac_DEVICE_MEMORY_ALLOCATOR} in
+    yes)
+      AC_DEFINE([GRID_DEVICE_MEMORY_ALLOCATOR],[1],[device memory allocator]);;
+    *);;
+esac
+
 ############### LOG VIEWS
 AC_ARG_ENABLE([log-views],
     [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],

From 928abe5c6df5e6d8375acd66530354963d830ccc Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Mon, 28 Jul 2025 21:51:07 +0200
Subject: [PATCH 42/53] DeviceMemoryAllocator

---
 Grid/allocator/DeviceMemoryAllocator.cc | 3 ++-
 configure.ac                            | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
index 0965207c52..9599d8ddfb 100644
--- a/Grid/allocator/DeviceMemoryAllocator.cc
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -4,7 +4,7 @@ NAMESPACE_BEGIN(Grid);
 
 #define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024)
 
-
+#ifdef GRID_DEVICE_MEMORY_ALLOCATOR
 struct DeviceMemoryAllocator {
 
   bool initialized;
@@ -110,5 +110,6 @@ void acceleratorFreeDevice(void *ptr) {
   sm.push_back(index);
   
 }
+#endif
 
 NAMESPACE_END(Grid);
diff --git a/configure.ac b/configure.ac
index 82954e753a..20f4c9d520 100644
--- a/configure.ac
+++ b/configure.ac
@@ -266,7 +266,7 @@ esac
 ############### CHECKSUM COMMS
 AC_ARG_ENABLE([checksum-comms],
     [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
-    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=no])
 
 case ${ac_CHECKSUM_COMMS} in
     yes)
@@ -277,7 +277,7 @@ esac
 ############### DEVICE MEMORY ALLOCATOR
 AC_ARG_ENABLE([device-memory-allocator],
     [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])],
-    [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=yes])
+    [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no])
 
 case ${ac_DEVICE_MEMORY_ALLOCATOR} in
     yes)
@@ -288,7 +288,7 @@ esac
 ############### LOG VIEWS
 AC_ARG_ENABLE([log-views],
     [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
-    [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
+    [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=no])
 
 case ${ac_LOG_VIEWS} in
     yes)

From c973aa2cb71319c016d61aeab49fb5bdbce16d1e Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 29 Jul 2025 05:00:52 +0000
Subject: [PATCH 43/53] fix header

---
 Grid/allocator/DeviceMemoryAllocator.cc | 28 +++++++++++++++++++++++++
 Grid/allocator/DeviceMemoryAllocator.h  |  3 +--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
index 9599d8ddfb..1a1763861e 100644
--- a/Grid/allocator/DeviceMemoryAllocator.cc
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -1,3 +1,31 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/DeviceMemoryAllocator.h
+
+    Copyright (C) 2025
+
+Author: Christoph Lehner <christoph@lhnr.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
 #include <Grid/GridCore.h>
 
 NAMESPACE_BEGIN(Grid);
diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h
index d3401d7204..13c7eb8e18 100644
--- a/Grid/allocator/DeviceMemoryAllocator.h
+++ b/Grid/allocator/DeviceMemoryAllocator.h
@@ -6,8 +6,7 @@
 
     Copyright (C) 2025
 
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by

From 5cdf5baf77b7c3a95020d2657867ae9e2ea3f170 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 29 Jul 2025 06:07:33 +0000
Subject: [PATCH 44/53] fix configure.ac

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 20f4c9d520..d747077a85 100644
--- a/configure.ac
+++ b/configure.ac
@@ -277,7 +277,7 @@ esac
 ############### DEVICE MEMORY ALLOCATOR
 AC_ARG_ENABLE([device-memory-allocator],
     [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])],
-    [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no])
+    [ac_DEVICE_MEMORY_ALLOCATOR=${enable_device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no])
 
 case ${ac_DEVICE_MEMORY_ALLOCATOR} in
     yes)

From 7a939edeb6c875f3064d45fdfbe84cdb09b50c4d Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 29 Jul 2025 15:01:50 +0000
Subject: [PATCH 45/53] oversubscription factor

---
 Grid/allocator/DeviceMemoryAllocator.cc | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
index 1a1763861e..b958cfa1b9 100644
--- a/Grid/allocator/DeviceMemoryAllocator.cc
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 NAMESPACE_BEGIN(Grid);
 
 #define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024)
+#define OVERALLOCATION_FACTOR 1.2
 
 #ifdef GRID_DEVICE_MEMORY_ALLOCATOR
 struct DeviceMemoryAllocator {
@@ -59,6 +60,13 @@ struct DeviceMemoryAllocator {
 
   void Init(size_t _size) {
     assert(!initialized);
+
+    char* str;
+    if ((str = getenv("GRID_OVERALLOCATION_FACTOR"))) {
+      _size = (size_t)(_size * atof(str));
+    } else {
+      _size = (size_t)(_size * OVERALLOCATION_FACTOR);
+    }
     
     size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
     size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
@@ -73,9 +81,17 @@ struct DeviceMemoryAllocator {
     {
       uint64_t* ba = (uint64_t*)base;
       size_t n = size / sizeof(uint64_t);
-      accelerator_for(i, n, 1, {
-	  ba[i] = (uint64_t)-1;
-	});
+      size_t MAX_BLOCK_INIT = 128*1024*1024;
+      while (n > 0) {
+	size_t n0 = n;
+	if (n0 > MAX_BLOCK_INIT)
+	  n0 = MAX_BLOCK_INIT;
+	accelerator_for(i, n0, 1, {
+	    ba[i] = (uint64_t)-1;
+	  });
+	ba += n0;
+	n -= n0;
+      }
     }  
     
     std::cout << GridLogMessage << "Done" << std::endl;
@@ -108,6 +124,8 @@ void *acceleratorAllocDevice(size_t bytes) {
     sm.pop_back();
     assert(dma.pages[index] == 0);
     dma.pages[index] = n_pages;
+
+    std::cout << GridLogMessage << "Can re-use pointer for " << n_pages << " pages" << std::endl;
     
     return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
   }

From 8bd4562a7b1d8a7010ce1420098c94f0751feff1 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 29 Jul 2025 20:38:26 +0200
Subject: [PATCH 46/53] quote reusable pages

---
 Grid/allocator/DeviceMemoryAllocator.cc | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
index b958cfa1b9..f107aa4cc9 100644
--- a/Grid/allocator/DeviceMemoryAllocator.cc
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -118,27 +118,37 @@ void *acceleratorAllocDevice(size_t bytes) {
   size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
   
   // first check if
-  auto & sm = dma.size_map[n_pages];
-  if (sm.size()) {
-    size_t index = sm.back();
-    sm.pop_back();
+  auto sm = dma.size_map.find(n_pages);
+  if (sm != dma.size_map.end() && sm->second.size() > 0) {
+    size_t index = sm->second.back();
+    sm->second.pop_back();
+
+    if (sm->second.size() == 0)
+      dma.size_map.erase(sm);
+    
     assert(dma.pages[index] == 0);
     dma.pages[index] = n_pages;
 
-    std::cout << GridLogMessage << "Can re-use pointer for " << n_pages << " pages" << std::endl;
+    std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl;
     
     return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
   }
 
   size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+
+  // TODO: if there are reusable pages, use one that is slightly too large; iterator goes from small to large
   assert(end <= dma.size);
   dma.pages[dma.offset] = n_pages;
 
   void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
   dma.offset += n_pages;
 
+  size_t reusable_pages = 0;
+  for (auto & sm : dma.size_map)
+    reusable_pages += sm.first * sm.second.size();
+
   std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate ("
-	    << (dma.size - end) * 100 / dma.size << "% free)" << std::endl;
+	    << (dma.size - end) * 100 / dma.size << "% free, " << reusable_pages << " reusable pages)" << std::endl;
 
   return ptr;
 

From bc4f7bdfc4dea17c00c82f33500ad090d8dcdc55 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 30 Jul 2025 08:41:15 +0200
Subject: [PATCH 47/53] cleanup device allocator

---
 Grid/allocator/DeviceMemoryAllocator.cc | 172 ++++++++++++++++--------
 1 file changed, 118 insertions(+), 54 deletions(-)

diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc
index f107aa4cc9..c6711deba2 100644
--- a/Grid/allocator/DeviceMemoryAllocator.cc
+++ b/Grid/allocator/DeviceMemoryAllocator.cc
@@ -40,12 +40,14 @@ struct DeviceMemoryAllocator {
   char* base;
   size_t size;
   size_t offset;
+  bool verbose;
 
   DeviceMemoryAllocator() {
     initialized = false;
     base = 0;
     size = 0;
     offset = 0;
+    verbose = false;
   }
 
   ~DeviceMemoryAllocator() {
@@ -67,16 +69,18 @@ struct DeviceMemoryAllocator {
     } else {
       _size = (size_t)(_size * OVERALLOCATION_FACTOR);
     }
+
+    verbose = (getenv("GRID_DEBUG_DEVICE_ALLOCATOR") != 0);
     
     size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
     size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
-    std::cout << GridLogMessage << "Init with " <<
-      size << " bytes" << std::endl;
+    std::cout << GridLogMessage << "Init device allocator with " << size << " bytes" << std::endl;
 
     base = (char*)acceleratorAllocDeviceInternal(size);
     assert(base);
-    
-    std::cout << GridLogMessage << "Initialize memory to zero" << std::endl;
+
+    if (verbose)
+      std::cout << GridLogMessage << "Initialize memory to zero" << std::endl;
 
     {
       uint64_t* ba = (uint64_t*)base;
@@ -93,78 +97,138 @@ struct DeviceMemoryAllocator {
 	n -= n0;
       }
     }  
-    
-    std::cout << GridLogMessage << "Done" << std::endl;
+
+    if (verbose)
+      std::cout << GridLogMessage << "Done" << std::endl;
     
     offset = 0;
     
     pages.resize(n_pages, 0);
-    
-    std::cout << GridLogMessage << "Pages initialized" << std::endl;
+
+    if (verbose)
+      std::cout << GridLogMessage << "Pages initialized" << std::endl;
   
     initialized = true;
   }
-};
 
-static DeviceMemoryAllocator dma;
+  void* attemptReuseExactSize(size_t n_pages) {
+    auto sm = size_map.find(n_pages);
+    if (sm != size_map.end() && sm->second.size() > 0) {
+      size_t index = sm->second.back();
+      sm->second.pop_back();
+      
+      if (sm->second.size() == 0)
+	size_map.erase(sm);
+      
+      assert(pages[index] == 0);
+      pages[index] = n_pages;
+
+      return base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    }
+    return 0;
+  }
 
-void *acceleratorAllocDevice(size_t bytes) {
-  if (!dma.initialized)
-    dma.Init(MemoryManager::DeviceMaxBytes);
-  
-  if (!bytes)
-    bytes++;
-  
-  size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
-  
-  // first check if
-  auto sm = dma.size_map.find(n_pages);
-  if (sm != dma.size_map.end() && sm->second.size() > 0) {
-    size_t index = sm->second.back();
-    sm->second.pop_back();
-
-    if (sm->second.size() == 0)
-      dma.size_map.erase(sm);
+  void* attemptAllocUnused(size_t n_pages) {
+    size_t end = (offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    void* ptr = 0;
     
-    assert(dma.pages[index] == 0);
-    dma.pages[index] = n_pages;
-
-    std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl;
+    if (end <= size) {
+      pages[offset] = n_pages;
     
-    return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+      ptr = base + offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+      offset += n_pages;
+
+      if (verbose) {
+	size_t reusable_pages = 0;
+	for (auto & sm : size_map)
+	  reusable_pages += sm.first * sm.second.size();
+	
+	std::cout << GridLogMessage << (size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate ("
+		  << (size - end) * 100 / size << "% unallocated, " << reusable_pages << " reusable pages)" << std::endl;
+      }
+    }
+
+    return ptr;
   }
 
-  size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+  void* alloc(size_t bytes) {
+    if (!initialized)
+      Init(MemoryManager::DeviceMaxBytes);
+    
+    if (!bytes)
+      bytes++;
+    
+    size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    
+    // first check if block of perfect size is available
+    void* ptr;
+    if ((ptr = attemptReuseExactSize(n_pages))) {
+      
+      if (verbose)
+	std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl;
+      
+      return ptr;
+    }
 
-  // TODO: if there are reusable pages, use one that is slightly too large; iterator goes from small to large
-  assert(end <= dma.size);
-  dma.pages[dma.offset] = n_pages;
+    // if not, attempt to allocate in the unused area
+    if ((ptr = attemptAllocUnused(n_pages)))
+      return ptr;
+
+    // last attempt, find a re-usable region that barely fits and return it
+    // for loop of std::map iterates in ascending order
+    size_t reusable_pages = 0;
+    size_t n_pages_usable = 0;
+    for (auto & sm : size_map) {
+      assert(sm.second.size() > 0); // should never be empty
+      reusable_pages += sm.first * sm.second.size();
+      if (n_pages_usable == 0 && sm.first > n_pages)
+	n_pages_usable = sm.first;
+    }    
+
+    if (n_pages_usable == 0) {
+      std::cout << GridLogMessage << "Out of memory for " << n_pages << " pages!  Re-usable pages at time of death:" << std::endl;
+
+      for (auto & sm : size_map) {
+	std::cout << GridLogMessage << sm.second.size() << " x " << sm.first << " pages" << std::endl;
+      }
+	
+      exit(1);
+    }
 
-  void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
-  dma.offset += n_pages;
+    if ((ptr = attemptReuseExactSize(n_pages_usable))) {
+      
+      if (verbose)
+	std::cout << GridLogMessage << "Can re-use pointer for " << n_pages_usable << " pages when " << n_pages << " were needed; " << reusable_pages << " reusable pages" << std::endl;
+      
+      return ptr;
+    }
 
-  size_t reusable_pages = 0;
-  for (auto & sm : dma.size_map)
-    reusable_pages += sm.first * sm.second.size();
+    // this should never be reached
+    assert(0);
+    return ptr;
+  }
 
-  std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate ("
-	    << (dma.size - end) * 100 / dma.size << "% free, " << reusable_pages << " reusable pages)" << std::endl;
+  void free(void* ptr) {
+    if (!initialized)
+      return;
+    
+    size_t index = ((size_t)((char*)ptr - base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
+    size_t n_pages = pages[index];
+    //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl;
+    pages[index] = 0;
+    auto & sm = size_map[n_pages];
+    sm.push_back(index);
+  }
+};
 
-  return ptr;
+static DeviceMemoryAllocator dma;
 
+void *acceleratorAllocDevice(size_t bytes) {
+  return dma.alloc(bytes);
 }
 
 void acceleratorFreeDevice(void *ptr) {
-  if (!dma.initialized)
-    return;
-  
-  size_t index = ((size_t)((char*)ptr - dma.base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE;
-  size_t n_pages = dma.pages[index];
-  //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl;
-  dma.pages[index] = 0;
-  auto & sm = dma.size_map[n_pages];
-  sm.push_back(index);
-  
+  dma.free(ptr);
 }
 #endif
 

From faed6d3e9d8bfa45326a2720b8916228b941b69b Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 12 Dec 2025 14:25:00 +0100
Subject: [PATCH 48/53] M5D virtual

---
 Grid/qcd/action/fermion/CayleyFermion5D.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h
index ec80b692b9..6cba3363a8 100644
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -87,14 +87,14 @@ class CayleyFermion5D : public WilsonFermion5D<Impl>
   /////////////////////////////////////////////////////
   // Instantiate different versions depending on Impl
   /////////////////////////////////////////////////////
-  void M5D(const FermionField &psi,
+  virtual void M5D(const FermionField &psi,
 	   const FermionField &phi,
 	   FermionField &chi,
 	   std::vector<Coeff_t> &lower,
 	   std::vector<Coeff_t> &diag,
 	   std::vector<Coeff_t> &upper);
 
-  void M5Ddag(const FermionField &psi,
+  virtual void M5Ddag(const FermionField &psi,
 	      const FermionField &phi,
 	      FermionField &chi,
 	      std::vector<Coeff_t> &lower,

From afd4423bef303fd176bcdb9f3ea89255cba06d53 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 28 Feb 2026 09:23:48 +0000
Subject: [PATCH 49/53] try a different barrier setup at Aurora

---
 Grid/threads/Accelerator.cc |  4 ++--
 Grid/threads/Accelerator.h  | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index aa39f88c93..f750b56c94 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -210,8 +210,8 @@ void acceleratorInit(void)
   //  sycl::gpu_selector selector;
   //  sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
-  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
-  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
+  //theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 9d9f765d2b..566a815987 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -351,7 +351,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 		     });						\
   });
 
-#define accelerator_barrier(dummy) { theGridAccelerator->wait(); theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait_and_throw(); theGridAccelerator->wait_and_throw(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
@@ -360,7 +360,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
-inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait_and_throw(); theCopyAccelerator->wait_and_throw(); }
 
 
 ///////
@@ -370,7 +370,7 @@ typedef sycl::event acceleratorEvent_t;
 
 inline void acceleratorEventWait(acceleratorEvent_t ev)
 {
-  ev.wait();
+  ev.wait_and_throw();
 }
 
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
@@ -382,9 +382,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
 
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();}
 
 inline int  acceleratorIsCommunicable(void *ptr)
 {

From a8706981d24024d04cd2b80e54dbf29766137c35 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 9 May 2026 08:11:33 +0000
Subject: [PATCH 50/53] clean compile merge

---
 Grid/communicator/Communicator_base.h         |  12 +-
 Grid/communicator/Communicator_mpi3.cc        |  42 +--
 Grid/cshift/Cshift_mpi.h                      |   1 -
 Grid/lattice/Lattice_reduction_sycl.h         |  20 --
 .../action/fermion/ImprovedStaggeredFermion.h |   6 +
 .../fermion/ImprovedStaggeredFermion5D.h      |   6 +
 .../action/fermion/NaiveStaggeredFermion.h    |   6 +
 Grid/qcd/action/fermion/WilsonCompressor.h    | 206 +-------------
 Grid/qcd/action/fermion/WilsonFermion.h       |   6 +
 Grid/qcd/action/fermion/WilsonFermion5D.h     |   9 +-
 Grid/stencil/Stencil.cc                       |   9 +-
 Grid/stencil/Stencil.h                        | 253 ++++++++++++++----
 12 files changed, 262 insertions(+), 314 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 690b5b23a0..deb93faee8 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -183,6 +183,7 @@ class CartesianCommunicator : public SharedMemory {
 		      int recv_from_rank,
 		      uint64_t bytes);
   
+  int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
@@ -201,9 +202,9 @@ class CartesianCommunicator : public SharedMemory {
   void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
 
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				    void *xmit,
+				    void *xmit,void *xmit_comp,
 				    int xmit_to_rank,int do_xmit,
-				    void *recv,
+				    void *recv,void *recv_comp,
 				    int recv_from_rank,int do_recv,
 				    uint64_t xbytes,uint64_t rbytes,int dir);
   
@@ -243,12 +244,7 @@ class CartesianCommunicator : public SharedMemory {
     Broadcast(root,(void *)&data,sizeof(data));
   }
 
-#ifdef GRID_CHECKSUM_COMMS
-  uint64_t checksumIndex;
-
-  uint64_t incrementChecksumIndex() { return checksumIndex++; };
-#endif
-};
+}; 
 
 NAMESPACE_END(Grid);
 
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 59f0dcac8e..3b8561d3ce 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -40,10 +40,6 @@ uint64_t checksum_index = 1;
 #endif
 
 
-#ifdef GRID_CHECKSUM_COMMS
-extern void * Grid_backtrace_buffer[_NBACKTRACE];
-#endif
-
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
@@ -255,11 +251,6 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
   for(int i=0;i<_ndimension*2;i++){
     MPI_Comm_dup(communicator,&communicator_halo[i]);
   }
-
-#ifdef GRID_CHECKSUM_COMMS
-  checksumIndex = 0;
-#endif
-
   GRID_ASSERT(Size==_Nprocessors);
 }
 
@@ -286,24 +277,24 @@ void CartesianCommunicator::GlobalSum(double &d)
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce float");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce double");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce uint32_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce uint64_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
@@ -317,26 +308,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  FlightRecorder::StepLog("GlobalXOR");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
+  FlightRecorder::StepLog("GlobalMax");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
+  FlightRecorder::StepLog("GlobalMax");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(float *)");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(double *)");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   GRID_ASSERT(ierr==0);
 }
@@ -410,11 +406,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   std::vector<CommsRequest_t> list;
   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }
-
+int CartesianCommunicator::IsOffNode(int rank)
+{
+  int grank = ShmRanks[rank];
+  if ( grank == MPI_UNDEFINED ) return true;
+  else return false;
+}
 
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -429,9 +430,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
+							 void *xmit,void *xmit_comp,
 							 int dest,int dox,
-							 void *recv,
+							 void *recv,void *recv_comp,
 							 int from,int dor,
 							 uint64_t xbytes,uint64_t rbytes,int dir)
 {
@@ -581,7 +582,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
-
       host_recv = this->HostBufferMalloc(rbytes);
       ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
       GRID_ASSERT(ierr==0);
@@ -603,7 +603,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       tag= dir+_processor*32;
 
       host_xmit = this->HostBufferMalloc(xbytes);
-
       CommsRequest_t srq;
 
 #ifdef GRID_CHECKSUM_COMMS
@@ -714,9 +713,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 }  
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
+							 void *xmit,void *xmit_comp,
 							 int dest,int dox,
-							 void *recv,
+							 void *recv,void *recv_comp,
 							 int from,int dor,
 							 uint64_t xbytes,uint64_t rbytes,int dir)
 {
@@ -900,6 +899,7 @@ int CartesianCommunicator::RankWorld(void){
   return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
+  FlightRecorder::StepLog("BarrierWorld");
   int ierr = MPI_Barrier(communicator_world);
   GRID_ASSERT(ierr==0);
 }
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 5755965e5f..916213f061 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -197,7 +197,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
   static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
   static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
-#endif
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
index 73ca4cbe5d..c0d1d07810 100644
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -87,26 +87,6 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
   theGridAccelerator->wait();
   return ret;
 }
-template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
-{
-  Word identity;  identity=0;
-  Word ret = 0;
-  { 
-    sycl::buffer<Word, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](sycl::handler &cgh) {
-      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-      cgh.parallel_for(sycl::range<1>{L},
-                       Reduction,
-                       [=] (sycl::id<1> index, auto &sum) {
-			 auto l = index % 61;
-                         sum ^= vec[index]<<l | vec[index]>>(64-l);
-                       });
-    });
-  }
-  theGridAccelerator->wait();
-  return ret;
-}
-
 template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
 {
   Word identity;  identity=0;
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
index 1f3ea83338..b202e3011a 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -154,6 +154,12 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 0706e7039c..5ce6d24115 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -179,6 +179,12 @@ class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public Improv
   StencilImpl Stencil; 
   StencilImpl StencilEven; 
   StencilImpl StencilOdd; 
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
     
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
index 9ec6be9016..53b1826792 100644
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -146,6 +146,12 @@ class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStagger
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index e23d55fc27..c3a5e4cfc3 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
-///////////////////////////////////////////////////////////////
-// Wilson compressor will need FaceGather policies for:
-// Periodic, Dirichlet, and partial Dirichlet for DWF
-///////////////////////////////////////////////////////////////
-const int dwf_compressor_depth=2;
-#define DWF_COMPRESS
-class FaceGatherPartialDWF
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else
-  static int PartialCompressionFactor(GridBase *grid) { return 1;}
-#endif
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-				   const Lattice<vobj> &rhs,
-				   cobj *buffer,
-				   compressor &compress,
-				   int off,int so,int partial)
-  {
-    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
-    //  Shrinks local and remote comms buffers
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else 
-    int depth=Ls/2;
-#endif
-    std::pair<int,int> *table_v = & table[0];
-    auto rhs_v = rhs.View(AcceleratorRead);
-    int vol=table.size()/Ls;
-    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
-	Integer i=idx/Ls;
-	Integer s=idx%Ls;
-	Integer sc=depth+s-(Ls-depth);
-	if(s<depth)     compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
-	if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
-    });
-    rhs_v.ViewClose();
-  }
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    auto Ls = dd.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth=Ls/2;
-#endif    
-    // Just pass in the Grid
-    auto kp = dd.kernel_p;
-    auto mp = dd.mpi_p;
-    int size= dd.buffer_size;
-    int vol= size/Ls;
-    accelerator_forNB(o,size,1,{
-	int idx=o/Ls;
-	int   s=o%Ls;
-	if ( s < depth ) {
-	  int oo=s*vol+idx;
-	  kp[o]=mp[oo];
-	} else if ( s >= Ls-depth ) {
-	  int sc = depth + s - (Ls-depth);
-	  int oo=sc*vol+idx; 
-	  kp[o]=mp[oo];
-	} else {
-	  kp[o] = Zero();//fill rest with zero if partial dirichlet
-	}
-    });
-  }
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Need to gather *interior portions* for ALL s-slices in simd directions
-  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
-  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				    compressor &compress,int type,int partial)
-  {
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-    
-    // insertion of zeroes...
-    assert( (table.size()&0x1)==0);
-    int num=table.size()/2;
-    int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-    
-    auto rhs_v = rhs.View(AcceleratorRead);
-    auto p0=&pointers[0][0];
-    auto p1=&pointers[1][0];
-    auto tp=&table[0];
-    int nnum=num/Ls;
-    accelerator_forNB(j, num, vobj::Nsimd(), {
-	//  Reorders both local and remote comms buffers
-	//  
-	int s  = j % Ls;
-	int sp1 = (s+depth)%Ls;  // peri incremented s slice
-	
-	int hxyz= j/Ls;
-
-	int xyz0= hxyz*2; // xyzt part of coor
-	int xyz1= hxyz*2+1;
-	
-	int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-	
-	int kk0= xyz0*Ls + s ; // s=0 goes to s=1
-	int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
-	compress.CompressExchange(p0[jj],p1[jj],
-				  rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
-				  rhs_v[so+tp[kk1 ].second], 
-				  type);
-    });
-    rhs_v.ViewClose();
-  }
-  // Merge routine is for SIMD faces
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    auto Ls = mm.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-    int  num= mm.buffer_size/2; // relate vol and Ls to buffer size
-    auto mp = &mm.mpointer[0];
-    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
-    auto vp1= &mm.vpointers[1][0];
-    auto type= mm.type;
-    int nnum = num/Ls;
-    accelerator_forNB(o,num,Merger::Nsimd,{
-
-	int  s=o%Ls;
-	int hxyz=o/Ls; // xyzt related component
-	int xyz0=hxyz*2;
-	int xyz1=hxyz*2+1;
-
-	int sp = (s+depth)%Ls; 
-	int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-
-	int oo0= s+xyz0*Ls;
-	int oo1= s+xyz1*Ls;
-
-	// same ss0, ss1 pair goes to new layout
-	decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
-      });
-  }
-};
-class FaceGatherDWFMixedBCs
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else 
-  static int PartialCompressionFactor(GridBase *grid) {return 1;}
-#endif
-  
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-					 const Lattice<vobj> &rhs,
-					 cobj *buffer,
-					 compressor &compress,
-					 int off,int so,int partial)
-  {
-    //    std::cout << " face gather simple DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-  }
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				    compressor &compress,int type,int partial)
-  {
-    //    std::cout << " face gather exch DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-    else        FaceGatherSimple::Gather_plane_exchange    (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-  }
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    int partial = mm.partial;
-    //    std::cout << " merge DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
-    else           FaceGatherSimple::MergeFace(decompress,mm);
-  }
-
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    int partial = dd.partial;
-    //    std::cout << " decompress DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
-    else           FaceGatherSimple::DecompressFace(decompress,dd);
-  }
-};
-
 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -242,8 +39,7 @@ class FaceGatherDWFMixedBCs
 
 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate  : public FaceGatherDWFMixedBCs
-//  : public FaceGatherSimple
+class WilsonCompressorTemplate : public FaceGatherSimple
 {
 public:
   
diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h
index 16320a9330..41a2471e2b 100644
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -165,6 +165,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
 
   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index 9dadc866f6..ea0eaa32e0 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -204,7 +204,14 @@ class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
   DoubledGaugeField Umu;
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;
-    
+
+
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
   // Comms buffer
   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 
diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc
index 27dc75eddd..1d04716918 100644
--- a/Grid/stencil/Stencil.cc
+++ b/Grid/stencil/Stencil.cc
@@ -30,25 +30,26 @@
 NAMESPACE_BEGIN(Grid);
 
 uint64_t DslashFullCount;
-uint64_t DslashPartialCount;
+//uint64_t DslashPartialCount;
 uint64_t DslashDirichletCount;
 
 void DslashResetCounts(void)
 {
   DslashFullCount=0;
-  DslashPartialCount=0;
+  //  DslashPartialCount=0;
   DslashDirichletCount=0;
 }
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
 {
   dirichlet = DslashDirichletCount;
-  partial   = DslashPartialCount;
+  partial   = 0;
   full      = DslashFullCount;
 }
 void DslashLogFull(void)     { DslashFullCount++;}
-void DslashLogPartial(void)  { DslashPartialCount++;}
+//void DslashLogPartial(void)  { DslashPartialCount++;}
 void DslashLogDirichlet(void){ DslashDirichletCount++;}
 
+deviceVector<unsigned char> StencilBuffer::DeviceCommBuf; 
 
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table)
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 881c467377..090b7fe02f 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
   Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
+  //  int  partialDirichlet;
   DefaultImplParams()  {
     dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
   };
 };
 
@@ -69,6 +69,12 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);
 
+class StencilBuffer
+{
+public:
+  static deviceVector<unsigned char> DeviceCommBuf;     // placed in Stencil.cc
+};
+
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
   ///////////////////////////////////////////////////
   // If true, this is partially communicated per face
   ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send; 
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send; 
+  //  StencilVector _comms_partial_recv;
   //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
@@ -210,16 +216,16 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   struct Packet {
     void * send_buf;
     void * recv_buf;
-#ifndef ACCELERATOR_AWARE_MPI
-    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
-    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
-#endif
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
     Integer to_rank;
     Integer from_rank;
     Integer do_send;
     Integer do_recv;
     Integer xbytes;
     Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
   };
   struct Merge {
     static constexpr int Nsimd = vobj::Nsimd();
@@ -228,7 +234,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct Decompress {
@@ -236,7 +242,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct CopyReceiveBuffer {
@@ -290,6 +296,12 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 public:
   GridBase *Grid(void) const { return _grid; }
 
+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
   // without adding parameters. Perhaps a template parameter to StenciView is
@@ -303,7 +315,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
 
   int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
   int fullDirichlet;
   std::vector<deviceVector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
@@ -396,24 +408,145 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
+
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+	// Can either choose to represent as float vs double and prec change
+	// OR
+	// truncate the mantissa bfp16 style
+	double *dbuf =(double *) packet.recv_buf;
+	float  *fbuf =(float  *) packet.compressed_recv_buf;
+
+	accelerator_forNB(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
+	});
+
+      } else if ( sizeof(word)==4){
+	// Can either choose to represent as half vs float and prec change
+        // OR
+	// truncate the mantissa bfp16 style
+
+	uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+	uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
+
+	accelerator_forNB(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
+	});
+
+      } else {
+	assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    packet.xbytes_compressed = packet.xbytes;
+    packet.compressed_send_buf = packet.send_buf;
+
+    packet.rbytes_compressed = packet.rbytes;
+    packet.compressed_recv_buf = packet.recv_buf;
+
+    if ( !SloppyComms  ) {
+      return;
+    }
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+
+    if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
+
+      packet.rbytes_compressed = packet.rbytes/2;
+      packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
+      //      std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
+      
+    }
+    //else {
+    //      std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
+    //    }
+    
+    if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
+
+      packet.xbytes_compressed = packet.xbytes/2;
+      packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
+      //      std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
+
+      if(sizeof(word)==8) {
+
+	double *dbuf =(double *) packet.send_buf;
+	float  *fbuf =(float  *) packet.compressed_send_buf;
+
+	accelerator_forNB(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
+	});
+
+      } else if ( sizeof(word)==4){
+
+	uint32_t *fbuf =(uint32_t *) packet.send_buf;
+	uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
+	
+	accelerator_forNB(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
+	});
+
+      } else {
+	assert(0 && "unknown floating point precision");
+      }
+
+    }
+    //    else {
+    //      std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
+    //    }
+
+    return;
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
-    //    std::cout << "Communicate Begin "<<std::endl;
-    //    _grid->Barrier();
     FlightRecorder::StepLog("Communicate begin");
+    ///////////////////////////////////////////////
     // All GPU kernel tasks must complete
-    //    accelerator_barrier();     // All kernels should ALREADY be complete
-    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
-                               // But the HaloGather had a barrier too.
+    //    accelerator_barrier();      All kernels should ALREADY be complete
+    //Everyone is here, so noone running slow and still using receive buffer
+    _grid->StencilBarrier();
+    // But the HaloGather had a barrier too.
+    ///////////////////////////////////////////////
+    if (SloppyComms) {
+      DeviceBufferFreeAll();
+    }
+    for(int i=0;i<Packets.size();i++){
+      this->CompressPacket(Packets[i]);
+    }
+    if (SloppyComms) { 
+      accelerator_barrier();
+#ifdef NVLINK_GET
+      _grid->StencilBarrier(); 
+#endif
+    }
+    
     for(int i=0;i<Packets.size();i++){
       //      std::cout << "Communicate prepare "<<i<<std::endl;
       //      _grid->Barrier();
       _grid->StencilSendToRecvFromPrepare(MpiReqs,
-					  Packets[i].send_buf,
+					  Packets[i].compressed_send_buf,
 					  Packets[i].to_rank,Packets[i].do_send,
-					  Packets[i].recv_buf,
+					  Packets[i].compressed_recv_buf,
 					  Packets[i].from_rank,Packets[i].do_recv,
-					  Packets[i].xbytes,Packets[i].rbytes,i);
+					  Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     //    std::cout << "Communicate PollDtoH "<<std::endl;
     //    _grid->Barrier();
@@ -424,19 +557,22 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     // Starts intranode
     for(int i=0;i<Packets.size();i++){
       //      std::cout << "Communicate Begin "<<i<<std::endl;
+      //      _grid->Barrier();
       _grid->StencilSendToRecvFromBegin(MpiReqs,
-					Packets[i].send_buf,
+					Packets[i].send_buf,Packets[i].compressed_send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
-					Packets[i].recv_buf,
+					Packets[i].recv_buf,Packets[i].compressed_recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
+					Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
+      //      std::cout << "Communicate Begin started "<<i<<std::endl;
+      //      _grid->Barrier();
     }
     FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
       if ( Packets[i].do_send )
-	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+	FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
     }
   }
 
@@ -451,14 +587,15 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     //    std::cout << "Communicate Complete Complete "<<std::endl;
     //    _grid->Barrier();
     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if   ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if   ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     //    accelerator_barrier(); 
     for(int i=0;i<Packets.size();i++){
+      this->DecompressPacket(Packets[i]);
       if ( Packets[i].do_recv )
-	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+	FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
     }
     FlightRecorder::StepLog("Finish communicate complete");
   }
@@ -655,7 +792,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
     Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
     d.dims     = _grid->_fdimensions;
     d.kernel_p = k_p;
     d.mpi_p    = m_p;
@@ -664,7 +801,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
   }
   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
     m.dims     = _grid->_fdimensions;
     m.type     = type;
     m.mpointer = merge_p;
@@ -769,8 +906,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
 	// Quiesce communication across block boundaries
@@ -791,10 +928,10 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
-	if ( partialDirichlet ) {
-	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
-	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-	}
+	//	if ( partialDirichlet ) {
+	//	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	//	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	//	}
       }
     }
   }
@@ -806,6 +943,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 		   Parameters p=Parameters(),
 		   bool preserve_shm=false)
   {
+    SloppyComms = 0;
     face_table_computed=0;
     _grid    = grid;
     this->parameters=p;
@@ -823,7 +961,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     this->same_node.resize(npoints);
 
     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
     fullDirichlet=0;
     for(int d=0;d<p.dirichlet.size();d++){
@@ -904,7 +1042,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     /////////////////////////////////////////////////////////////////////////////////
     const int Nsimd = grid->Nsimd();
 
-    // Allow for multiple stencils to exist simultaneously
+    // Allow for multiple stencils to be communicated simultaneously
     if (!preserve_shm)
       _grid->ShmBufferFreeAll();
 
@@ -972,7 +1110,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
     GridBase *grid=_grid;
     const int Nsimd = grid->Nsimd();
 
-    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    //    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv      = this->_comms_recv[point];
     int fd              = _grid->_fdimensions[dimension];
     int ld              = _grid->_ldimensions[dimension];
     int rd              = _grid->_rdimensions[dimension];
@@ -1161,8 +1300,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
     int comms_send   = this->_comms_send[point];
     int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
     
     GRID_ASSERT(rhs.Grid()==_grid);
     //	  conformable(_grid,rhs.Grid());
@@ -1197,11 +1336,11 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	int rbytes;
 
 	if ( comms_send ) xbytes = bytes; // Full send
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0; // full dirichlet
 
 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 	
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1228,7 +1367,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	}
 
 
-	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	//	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	if ( compress.DecompressionStep()&&comms_recv) {
 	  recv_buf=u_simd_recv_buf[0];
 	} else {
 	  recv_buf=this->u_recv_buf_p;
@@ -1262,7 +1402,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 #endif
 
 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	//	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
 
         int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
 	if ( !duplicate ) { // Force comms for now
@@ -1271,8 +1412,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
-	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+	  int do_send = (comms_send) && (!shm_send );
+	  int do_recv = (comms_send) && (!shm_recv );
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
@@ -1280,7 +1421,7 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 		    xbytes,rbytes);
 	}
 
-	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
+	if ( (compress.DecompressionStep() && comms_recv) ) {
 	  AddDecompress(&this->u_recv_buf_p[comm_off],
 			&recv_buf[comm_off],
 			words,Decompressions);
@@ -1302,8 +1443,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
     int comms_send   = this->_comms_send[point];
     int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
 
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1378,18 +1519,20 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 
 	
 	if ( comms_send ) xbytes = bytes;
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0;
 
 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 
 	// Gathers SIMD lanes for send and merge
 	// Different faces can be full comms or partial comms with  multiple ranks per node
-	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	//	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	if ( comms_send || comms_recv ) {
 
-	  int partial = partialDirichlet;
+	  //	  int partial = partialDirichlet;
+	  int partial = 0;
 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs,
 					    spointers,dimension,sx,cbmask,
 					    compress,permute_type,partial );
@@ -1455,7 +1598,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
-	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      //	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      int do_send = (comms_send) && (!shm_send );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
@@ -1469,7 +1613,8 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 	  }
 	}
 	// rpointer may be doing a remote read in the gather over SHM
-	if ( comms_recv|comms_partial_recv ) {
+	//	if ( comms_recv|comms_partial_recv ) {
+	if ( comms_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
 

From 30cf9a0a7984d610a6081d423d65dec3a8326d3c Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 9 May 2026 10:16:17 +0200
Subject: [PATCH 51/53] Update Benchmark_dwf_fp32.cc

---
 benchmarks/Benchmark_dwf_fp32.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index c83e129a47..f092751946 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -266,7 +266,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   FermionAction::ImplParams p;
   p.dirichlet=Dirichlet;
   FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
-  //Dw.SloppyComms(sloppy);
+  Dw.SloppyComms(sloppy);
   Dw.ImportGauge(Umu);
   
   int ncall =300;

From 14bff9e1aa9f5ead983e246a8213d21b67564d74 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 9 May 2026 08:19:26 +0000
Subject: [PATCH 52/53] merge adjust

---
 Grid/communicator/Communicator_none.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 5adbc5d5de..a7232fcdd4 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -127,6 +127,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
   dest=0;
 }
 
+int CartesianCommunicator::IsOffNode(int rank) { return false; }
+
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,

From a6ede60b08b266ff3d97d46a753908fc70552305 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Sat, 9 May 2026 08:35:01 +0000
Subject: [PATCH 53/53] merge adjust

---
 Grid/communicator/SharedMemoryMPI.cc          | 55 ++++---------------
 Grid/cshift/Cshift_mpi.h                      |  4 +-
 Grid/lattice/Lattice_reduction_gpu.h          |  2 +-
 Grid/lattice/Lattice_reduction_sycl.h         | 10 ++--
 Grid/qcd/action/fermion/CayleyFermion5D.h     |  4 +-
 .../implementation/CayleyFermion5Dcache.h     |  4 +-
 Grid/qcd/smearing/GaugeConfiguration.h        |  1 -
 Grid/threads/Accelerator.cc                   |  4 +-
 Grid/util/Init.cc                             |  3 +-
 HMC/ComputeWilsonFlow.cc                      |  3 +-
 configure.ac                                  |  2 +-
 11 files changed, 29 insertions(+), 63 deletions(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 4af766fd56..213543ccd7 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -540,49 +540,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   // printf("Host buffer allocate for GPU non-aware MPI\n");
-#if 0
-  HostCommBuf= acceleratorAllocHost(bytes);
-#else 
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#if 0
-  #warning "Moving host buffers to specific NUMA domain"
-  int numa;
-  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-  if(numa_name) {
-    unsigned long page_size = sysconf(_SC_PAGESIZE);
-    numa = atoi(numa_name);
-    unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(page_count);
-    std::vector<int>    nodes(page_count,numa);
-    std::vector<int>    status(page_count,-1);
-    for(unsigned long p=0;p<page_count;p++){
-      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-    }
-    int ret = move_pages(0,
-			 page_count,
-			 &pages[0],
-			 &nodes[0],
-			 &status[0],
-			 MPOL_MF_MOVE);
-    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-    if (ret) perror(" move_pages failed for reason:");
-  }
-#endif  
-  acceleratorPin(HostCommBuf,bytes);
-#endif  
-
 #endif  
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);  
   }
   if ( WorldRank == 0 ){
-    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << Mheader " acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
+  if ( WorldRank == 0 ){
+    std::cout<< Mheader "Setting up IPC"<<std::endl;
+  }
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over ranks/gpu's on our node
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -613,8 +585,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( err != ZE_RESULT_SUCCESS ) {
 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
-      } else {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
       }
       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
       handle.pid = getpid();
@@ -673,12 +643,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
       myfd=UnixSockets::RecvFileDescriptor();
 #else
-      std::cout<<"mapping seeking remote pid/fd "
-	       <<handle.pid<<"/"
-	       <<handle.fd<<std::endl;
+      //      std::cout<<"mapping seeking remote pid/fd "
+      //	       <<handle.pid<<"/"
+      //	       <<handle.fd<<std::endl;
 
       int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+      //      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
       //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
       myfd  = syscall(438,pidfd,handle.fd,0);
       int err_t = errno;
@@ -688,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	assert(0);
       }
 #endif
-      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      //      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
 
@@ -697,9 +667,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
-      } else {
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
       }
       GRID_ASSERT(thisBuf!=nullptr);
     }
@@ -780,6 +747,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     WorldShmCommBufs[r] =ptr;
     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
+  std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 };
@@ -1042,6 +1010,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
   } else { 
     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    //    std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
     return (void *) remote;
   }
 }
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 916213f061..252478ade2 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -192,13 +192,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
-
 #ifndef ACCELERATOR_AWARE_MPI
   int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
   static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
   static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
 #endif
-
+  
   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
   RealD tcopy=0.0;
@@ -351,7 +350,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     send_buf_extract[s].resize(buffer_size);
     recv_buf_extract[s].resize(buffer_size);
   }
-
 #ifndef ACCELERATOR_AWARE_MPI
 #ifdef GRID_CHECKSUM_COMMS
   buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 461392f82d..849f53095e 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -52,7 +52,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
   // let the number of threads in a block be a multiple of 2, starting from warpSize
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
-    //std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
     return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
index c0d1d07810..ace7f01086 100644
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -96,11 +96,11 @@ template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
     theGridAccelerator->submit([&](sycl::handler &cgh) {
       auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
       cgh.parallel_for(sycl::range<1>{L},
-                      Reduction,
-                      [=] (sycl::id<1> index, auto &sum) {
-			auto l = index % 61;
-                        sum ^= vec[index]<<l | vec[index]>>(64-l);
-                      });
+                       Reduction,
+                       [=] (sycl::id<1> index, auto &sum) {
+			 auto l = index % 61;
+                         sum ^= vec[index]<<l | vec[index]>>(64-l);
+                       });
     });
   }
   theGridAccelerator->wait();
diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h
index 6d6c93237e..77491def22 100644
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -87,14 +87,14 @@ class CayleyFermion5D : public WilsonFermion5D<Impl>
   /////////////////////////////////////////////////////
   // Instantiate different versions depending on Impl
   /////////////////////////////////////////////////////
-  virtual void M5D(const FermionField &psi,
+  void M5D(const FermionField &psi,
 	   const FermionField &phi,
 	   FermionField &chi,
 	   std::vector<Coeff_t> &lower,
 	   std::vector<Coeff_t> &diag,
 	   std::vector<Coeff_t> &upper);
 
-  virtual void M5Ddag(const FermionField &psi,
+  void M5Ddag(const FermionField &psi,
 	      const FermionField &phi,
 	      FermionField &chi,
 	      std::vector<Coeff_t> &lower,
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 8e7e2a603a..4bc703a313 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -130,7 +130,7 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
   GridBase *grid=psi_i.Grid();
 
   autoView(psi , psi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWriteDiscard);
+  autoView(chi , chi_i,AcceleratorWrite);
 
   int Ls=this->Ls;
 
@@ -194,7 +194,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
   int Ls=this->Ls;
 
   autoView(psi , psi_i,AcceleratorRead);
-  autoView(chi , chi_i,AcceleratorWriteDiscard);
+  autoView(chi , chi_i,AcceleratorWrite);
 
   acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
   acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h
index 522541a19d..6f6fd016ef 100644
--- a/Grid/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -120,7 +120,6 @@ class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
   }
 public:
   /*! @brief Returns smeared configuration at level 'Level' */
-public:
   const GaugeField &get_smeared_conf(int Level) const
   {
     return SmearedSet[Level];
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index f750b56c94..aa39f88c93 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -210,8 +210,8 @@ void acceleratorInit(void)
   //  sycl::gpu_selector selector;
   //  sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
-  //theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
-  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
+  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
+  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 2702b89629..23061372a1 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -397,7 +397,7 @@ void Grid_init(int *argc,char ***argv)
   /////////////////////////////////////////////////////////////////
   if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
     Grid_quiesce_nodes();
-  } else {
+  } else { 
     FILE *fp;
     std::ostringstream fname;
 
@@ -412,7 +412,6 @@ void Grid_init(int *argc,char ***argv)
     fname << (rank/radix)*radix ;
     mkdir(fname.str().c_str(), S_IRWXU );
     fname << "/";
-
     fname<<"Grid.stdout.";
     fname<<CartesianCommunicator::RankWorld();
 
diff --git a/HMC/ComputeWilsonFlow.cc b/HMC/ComputeWilsonFlow.cc
index 223cb0d842..9bc935361d 100644
--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -216,7 +216,8 @@ int main(int argc, char **argv) {
   int t=WFPar.maxTau;
   WF.smear(Uflow, Umu);
   //  NerscIO::writeConfiguration(Uflow,filesmr);
-
+  
+  
   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
   RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
diff --git a/configure.ac b/configure.ac
index b06303799d..ce786b0169 100644
--- a/configure.ac
+++ b/configure.ac
@@ -275,7 +275,7 @@ esac
 ############### CHECKSUM COMMS
 AC_ARG_ENABLE([checksum-comms],
     [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
-    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+    [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=no])
 
 case ${ac_CHECKSUM_COMMS} in
     yes)