From f84410d8e06eb457754c3022ca5e6cfc3131cf3b Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 20 May 2023 11:22:58 +0200 Subject: [PATCH 01/53] reduce zmobius verbosity --- Grid/qcd/action/fermion/ZMobiusFermion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index fc8a74395d..365714f417 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -57,7 +57,7 @@ class ZMobiusFermion : public CayleyFermion5D { // RealD eps = 1.0; - std::cout< zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; From d20caace9834daa5bdc0bd52299c717fdc797858 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 25 Sep 2024 12:40:38 +0200 Subject: [PATCH 02/53] Low-level half-precision comms framework and bfloat16 implementation --- Grid/communicator/Communicator_base.cc | 2 + Grid/communicator/Communicator_base.h | 10 ++- Grid/communicator/Communicator_mpi3.cc | 98 +++++++++++++++++++++++--- Grid/communicator/Communicator_none.cc | 6 +- Grid/communicator/SharedMemory.h | 2 +- Grid/stencil/Stencil.h | 6 +- Grid/util/Init.cc | 7 +- benchmarks/Benchmark_comms.cc | 16 ++--- 8 files changed, 121 insertions(+), 26 deletions(-) diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc index 79efb90cba..e2118eba35 100644 --- a/Grid/communicator/Communicator_base.cc +++ b/Grid/communicator/Communicator_base.cc @@ -41,6 +41,8 @@ bool Stencil_force_mpi = true; CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; int CartesianCommunicator::nCommThreads = -1; +CartesianCommunicator::StencilCompressionPolicy_t +CartesianCommunicator::StencilCompressionPolicy= CartesianCommunicator::StencilCompressionPolicyNone; ///////////////////////////////// // Grid information queries diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index b98424a19c..ffe960e950 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -49,6 +49,10 @@ class CartesianCommunicator : public SharedMemory { static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } static int nCommThreads; + enum StencilCompressionPolicy_t { StencilCompressionPolicyNone, StencilCompressionPolicyBfloat16 }; + static StencilCompressionPolicy_t StencilCompressionPolicy; + static void SetStencilCompressionPolicy(StencilCompressionPolicy_t policy ) { StencilCompressionPolicy = policy; } + //////////////////////////////////////////// // Communicator should know nothing of the physics grid, only processor grid. //////////////////////////////////////////// @@ -148,17 +152,17 @@ class CartesianCommunicator : public SharedMemory { int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int bytes,int dir); + int bytes,int dir,size_t word_size); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int xbytes,int rbytes,int dir); + int xbytes,int rbytes,int dir,size_t word_size); - void StencilSendToRecvFromComplete(std::vector &waitall,int i); + void StencilSendToRecvFromComplete(std::vector &waitall,int i,size_t word_size); void StencilBarrier(void); //////////////////////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index e7d7a96def..cbe5c5bb20 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -340,11 +340,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest, int dox, void *recv, int from, int dor, - int bytes,int dir) + int bytes,int dir,size_t word_size) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); - StencilSendToRecvFromComplete(list,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir,word_size); + StencilSendToRecvFromComplete(list,dir,word_size); return offbytes; } @@ -353,7 +353,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Wed, 25 Sep 2024 12:49:53 +0200 Subject: [PATCH 03/53] adjust help message --- Grid/util/Init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index bd2ce36600..e420524791 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -455,7 +455,7 @@ void Grid_init(int *argc,char ***argv) std::cout< Date: Fri, 27 Sep 2024 20:07:21 +0300 Subject: [PATCH 04/53] mute benchmarking logs mute benchmarking logs (2) --- Grid/communicator/Communicator_mpi3.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index cbe5c5bb20..1859d976a3 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -399,7 +399,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +//////////////////////// +/// Move to domains //// +//////////////////////// + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT +}; + +void Benchmark(int Ls, Coordinate Dirichlet); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + int Ls=16; + for(int i=0;i> Ls; + } + } + + ////////////////// + // With comms + ////////////////// + Coordinate Dirichlet(Nd+1,0); + + std::cout << "\n\n\n\n\n\n" < seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); +#define SINGLE +#ifdef SINGLE + typedef vComplexF Simd; + typedef LatticeFermionF FermionField; + typedef LatticeGaugeFieldF GaugeField; + typedef LatticeColourMatrixF ColourMatrixField; + typedef DomainWallFermionF FermionAction; +#endif +#ifdef DOUBLE + typedef vComplexD Simd; + typedef LatticeFermionD FermionField; + typedef LatticeGaugeFieldD GaugeField; + typedef LatticeColourMatrixD ColourMatrixField; + typedef DomainWallFermionD FermionAction; +#endif +#ifdef DOUBLE2 + typedef vComplexD2 Simd; + typedef LatticeFermionD2 FermionField; + typedef LatticeGaugeFieldD2 GaugeField; + typedef LatticeColourMatrixD2 ColourMatrixField; + typedef DomainWallFermionD2 FermionAction; +#endif + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + + + FermionField src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); + UmuCopy=Umu; + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; + + //////////////////////////////////// + // Apply BCs + //////////////////////////////////// + Coordinate Block(4); + for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; + + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl; + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl; + + DirichletFilter Filter(Block); + Filter.applyFilter(Umu); + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + std::cout<Barrier(); + std::cout<Barrier(); + exit(-1); + } + assert (n2e< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;sBarrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu Date: Wed, 18 Dec 2024 14:33:27 +0100 Subject: [PATCH 06/53] Update Benchmark_dwf_fp32_jureap.cc --- benchmarks/Benchmark_dwf_fp32_jureap.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc index 57b684e991..a768b00a07 100644 --- a/benchmarks/Benchmark_dwf_fp32_jureap.cc +++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc @@ -45,6 +45,8 @@ Gamma::Algebra Gmu [] = { void Benchmark(int Ls, Coordinate Dirichlet); +#include + int main (int argc, char ** argv) { Grid_init(&argc,&argv); @@ -247,13 +249,25 @@ void Benchmark(int Ls, Coordinate Dirichlet) FGrid->Barrier(); Dw.Dhop(src,result,0); std::cout<Barrier(); + std::this_thread::sleep_for(std::chrono::seconds(5)); + auto cpu_start = std::chrono::system_clock::now(); double t0=usecond(); for(int i=0;iBarrier(); + auto cpu_stop = std::chrono::system_clock::now(); + std::this_thread::sleep_for(std::chrono::seconds(5)); + if (FGrid->ThisRank() == 0) { + std::ofstream file("energy.times"); + file<<"energy_start:"<< std::chrono::duration_cast(cpu_start.time_since_epoch()).count()<(cpu_stop.time_since_epoch()).count()< Date: Wed, 18 Dec 2024 15:38:37 +0100 Subject: [PATCH 07/53] Update Benchmark_dwf_fp32_jureap.cc --- benchmarks/Benchmark_dwf_fp32_jureap.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc index a768b00a07..92dd40bc1b 100644 --- a/benchmarks/Benchmark_dwf_fp32_jureap.cc +++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc @@ -46,6 +46,7 @@ Gamma::Algebra Gmu [] = { void Benchmark(int Ls, Coordinate Dirichlet); #include +#include int main (int argc, char ** argv) { From eb1058da1d1b45c1dcc63acb6af4baa7ec661cb6 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 26 Jan 2025 16:56:07 +0100 Subject: [PATCH 08/53] Merge with upstream --- Grid/util/Lexicographic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h index 422e42ee76..8636333d77 100644 --- a/Grid/util/Lexicographic.h +++ b/Grid/util/Lexicographic.h @@ -50,7 +50,7 @@ namespace Grid{ int64_t index64; IndexFromCoorReversed(coor,index64,dims); if ( index64>=2*1024*1024*1024LL ){ - std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "< Date: Sun, 26 Jan 2025 17:14:29 +0100 Subject: [PATCH 09/53] complete merge conflict resolution --- Grid/communicator/Communicator_mpi3.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b683b7cb40..058a7cdf90 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -325,12 +325,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis tag= dir+from*32; int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq); assert(ierr==0); - list.push_back(rrq); + list.push_back({rrq, recv, bytes}); tag= dir+_processor*32; ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq); assert(ierr==0); - list.push_back(xrq); + list.push_back({xrq, xmit, bytes}); } void CartesianCommunicator::CommsComplete(std::vector &list) { @@ -339,7 +339,11 @@ void CartesianCommunicator::CommsComplete(std::vector &list) if (nreq==0) return; std::vector status(nreq); - int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + std::vector requests(nreq); + for (int i=0;i Date: Sat, 22 Feb 2025 16:26:12 +0100 Subject: [PATCH 10/53] allow for transpose in gemm of complex matrices --- Grid/algorithms/blas/BatchedBlas.h | 50 +++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index f4092bc535..ce615173ec 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -208,8 +208,8 @@ class GridBLAS { assert(Bkn.size()==batchCount); assert(Cmn.size()==batchCount); - assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose - assert(OpB!=GridBLAS_OP_T); + //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + //assert(OpB!=GridBLAS_OP_T); int lda = m; // m x k column major int ldb = k; // k x n column major @@ -376,6 +376,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); @@ -383,6 +390,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); @@ -390,6 +404,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; } ); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + } ); } else { assert(0); } @@ -414,8 +435,8 @@ class GridBLAS { RealD t2=usecond(); int32_t batchCount = Amk.size(); - assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose - assert(OpB!=GridBLAS_OP_T); + //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + //assert(OpB!=GridBLAS_OP_T); int lda = m; // m x k column major int ldb = k; // k x n column major @@ -523,6 +544,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); @@ -530,6 +558,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); @@ -537,6 +572,13 @@ class GridBLAS { Eigen::Map eCmn(Cmn[p],m,n); eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; } ); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + } ); } else { assert(0); } From 059b6e67f0be0a7fb1595bb2e6df4d67747564e0 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 2 Mar 2025 14:23:01 +0100 Subject: [PATCH 11/53] default gemm needs to ignore C if beta vanishes --- Grid/algorithms/blas/BatchedBlas.h | 107 +++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 22 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index ce615173ec..88565df1a5 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -367,28 +367,40 @@ class GridBLAS { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + else + eCmn = alpha * eAmk.adjoint() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + else + eCmn = alpha * eAmk * eBkn.adjoint() ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { @@ -402,14 +414,20 @@ class GridBLAS { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + else + eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ; } ); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; } ); } else { assert(0); @@ -535,49 +553,70 @@ class GridBLAS { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + else + eCmn = alpha * eAmk.adjoint() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + else + eCmn = alpha * eAmk * eBkn.adjoint() ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + else + eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ; } ); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; } ); } else { assert(0); @@ -703,29 +742,41 @@ class GridBLAS { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; - } ); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; + }); } else { assert(0); } @@ -851,28 +902,40 @@ class GridBLAS { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; }); } else { assert(0); From 2acb6578ecf8d7f30478733a1c711039e6745b22 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 10 Mar 2025 06:58:47 +0100 Subject: [PATCH 12/53] cpu target async --- Grid/threads/Accelerator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1cb56dddad..0128856cc5 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -568,6 +568,8 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} +inline void acceleratorCopyToDeviceAsync(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } +inline void acceleratorCopyFromDeviceAsync(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorCopySynchronise(void) {}; From e85b7986cf144f2b852da2e4635b8dd726b1060d Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 20 Mar 2025 11:38:24 +0100 Subject: [PATCH 13/53] library adjustment for JEDI --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 8e8d67af77..1483bcae35 100644 --- a/configure.ac +++ b/configure.ac @@ -136,7 +136,7 @@ AC_ARG_ENABLE([tracing], case ${ac_TRACING} in nvtx) AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX]) - LIBS="${LIBS} -lnvToolsExt64_1" + LIBS="${LIBS} -lnvToolsExt" ;; roctx) AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX]) From 98cce16f0686da4b8d7a5f971bb6bed31c5b9c9c Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 21 Mar 2025 12:58:38 +0100 Subject: [PATCH 14/53] Towards inverse and determinant in batched blas on all platforms --- Grid/algorithms/blas/BatchedBlas.h | 186 +++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 88565df1a5..9de035d069 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -946,6 +946,192 @@ class GridBLAS { RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; } + /* + Inverse and Determinant + + - CPU version uses Eigen + - GPU version uses LAPACK-compatible getrf / getri + + Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner. + Overhead to go through getrf / getri for CPU version too large. + Current interface therefore only guarantees the inverse and determinant + functions on all platforms but not the getrf / getri ones. + */ +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + + void inverseBatched(int64_t n, + deviceVector &Ann, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + assert(batchCount == Cnn.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + Eigen::Map eCnn(Cnn[p],n,n); + eCnn = eAnn.inverse(); + }); + } + + void inverseBatched(int64_t n, + deviceVector &Ann, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + assert(batchCount == Cnn.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + Eigen::Map eCnn(Cnn[p],n,n); + eCnn = eAnn.inverse(); + }); + } + + void determinantBatched(int64_t n, + deviceVector &Ann, + deviceVector &C) { + + int64_t batchCount = Ann.size(); + assert(batchCount == C.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + *C[p] = eAnn.determinant(); + }); + } + + void determinantBatched(int64_t n, + deviceVector &Ann, + deviceVector &C) { + + int64_t batchCount = Ann.size(); + assert(batchCount == C.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + *C[p] = eAnn.determinant(); + }); + } + +#else + + void getrfBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount); + assert(info.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasZgetrfBatched(gridblasHandle,(int)n, + (hipblasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cudablasZgetrfBatched(gridblasHandle, (int)n, + (cublasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + assert(0); + /* + TODO: cache scratchpad + + oneapi::mkl::lapack::gerf_batch(*gridblasHandle, + &n, &n, + (ComplexD **)&Ann[0], + &n, + (int64_t)&ipiv[0], + (int64_t)1, &batchCount, + scratchpad, scratchpad_size, + std::vector()); + */ + synchronise(); +#endif + } + + void getriBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info, + deviceVector &Cnn) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount); + assert(info.size()==batchCount); + assert(Cnn.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasZgetrfBatched(gridblasHandle,(int)n, + (hipblasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (hipblasDoubleComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cudablasZgetrfBatched(gridblasHandle, (int)n, + (cublasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (cublasDoubleComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + assert(0); + /* + TODO: cache scratchpad + oneapi::mkl::lapack::geri_batch(*gridblasHandle, + &n, &n, + (ComplexD **)&Ann[0], + &n, + (int64_t)&ipiv[0], + (int64_t)1, &batchCount, + scratchpad, scratchpad_size, + std::vector()); + */ + synchronise(); +#endif + } + + template + void inverseBatched(int64_t n, + deviceVector &Ann, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + RealD t0 = usecond(); + deviceVector ipiv(batchCount*n); + deviceVector info(batchCount); + deviceVector _Bnn(batchCount*n*n); + deviceVector Bnn(batchCount); + + accelerator_for(i,batchCount,1,{ + Bnn[i] = &_Bnn[n*n*i]; + }); + accelerator_for(i,batchCount*n*n,1,{ + int64_t a = i / (n*n); + int64_t b = i % (n*n); + _Bnn[i] = Ann[i][b]; + }); + + RealD t1 = usecond(); + getrfBatched(n, Bnn, ipiv, info); + // test info for non-invertibility? set to nan if yes? + RealD t2 = usecond(); + getriBatched(n, Bnn, ipiv, info, Cnn); + RealD t3 = usecond(); + std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl; + } +#endif + + template double benchmark(int M, int N, int K, int BATCH) { From 1d7edc32c2255e83ff66b436433fdada9bd388a7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 21 Mar 2025 15:35:59 +0100 Subject: [PATCH 15/53] Continue with cublas --- Grid/algorithms/blas/BatchedBlas.h | 156 ++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 12 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 9de035d069..52137b48d0 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -1029,13 +1029,56 @@ class GridBLAS { assert(err==HIPBLAS_STATUS_SUCCESS); #endif #ifdef GRID_CUDA - auto err = cudablasZgetrfBatched(gridblasHandle, (int)n, - (cublasDoubleComplex **)&Ann[0], (int)n, - (int*) &ipiv[0], - (int*) &info[0], - (int)batchCount); + auto err = cublasZgetrfBatched(gridblasHandle, (int)n, + (cuDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + assert(0); + /* + TODO: cache scratchpad + + oneapi::mkl::lapack::gerf_batch(*gridblasHandle, + &n, &n, + (ComplexD **)&Ann[0], + &n, + (int64_t)&ipiv[0], + (int64_t)1, &batchCount, + scratchpad, scratchpad_size, + std::vector()); + */ + synchronise(); +#endif + } + +void getrfBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount); + assert(info.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasCgetrfBatched(gridblasHandle,(int)n, + (hipblasComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); assert(err==HIPBLAS_STATUS_SUCCESS); #endif +#ifdef GRID_CUDA + auto err = cublasCgetrfBatched(gridblasHandle, (int)n, + (cuComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif #ifdef GRID_SYCL assert(0); /* @@ -1066,7 +1109,7 @@ class GridBLAS { assert(Cnn.size()==batchCount); #ifdef GRID_HIP - auto err = hipblasZgetrfBatched(gridblasHandle,(int)n, + auto err = hipblasZgetriBatched(gridblasHandle,(int)n, (hipblasDoubleComplex **)&Ann[0], (int)n, (int*) &ipiv[0], (hipblasDoubleComplex **)&Cnn[0], (int)n, @@ -1075,14 +1118,60 @@ class GridBLAS { assert(err==HIPBLAS_STATUS_SUCCESS); #endif #ifdef GRID_CUDA - auto err = cudablasZgetrfBatched(gridblasHandle, (int)n, - (cublasDoubleComplex **)&Ann[0], (int)n, - (int*) &ipiv[0], - (cublasDoubleComplex **)&Cnn[0], (int)n, - (int*) &info[0], - (int)batchCount); + auto err = cublasZgetriBatched(gridblasHandle, (int)n, + (cuDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (cuDoubleComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + assert(0); + /* + TODO: cache scratchpad + oneapi::mkl::lapack::geri_batch(*gridblasHandle, + &n, &n, + (ComplexD **)&Ann[0], + &n, + (int64_t)&ipiv[0], + (int64_t)1, &batchCount, + scratchpad, scratchpad_size, + std::vector()); + */ + synchronise(); +#endif + } + + void getriBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info, + deviceVector &Cnn) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount); + assert(info.size()==batchCount); + assert(Cnn.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasCgetriBatched(gridblasHandle,(int)n, + (hipblasComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (hipblasComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); assert(err==HIPBLAS_STATUS_SUCCESS); #endif +#ifdef GRID_CUDA + auto err = cublasCgetriBatched(gridblasHandle, (int)n, + (cuComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (cuComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif #ifdef GRID_SYCL assert(0); /* @@ -1129,6 +1218,49 @@ class GridBLAS { RealD t3 = usecond(); std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl; } + + template + void determinantBatched(int64_t n, + deviceVector &Ann, + deviceVector &C) { + + int64_t batchCount = Ann.size(); + RealD t0 = usecond(); + deviceVector ipiv(batchCount*n); + deviceVector info(batchCount); + deviceVector _Bnn(batchCount*n*n); + deviceVector Bnn(batchCount); + + dtype** pAnn = (dtype**)&Ann[0]; + dtype** pBnn = (dtype**)&Bnn[0]; + dtype* p_Bnn = (dtype*)&_Bnn[0]; + dtype* pC = (dtype*)&C[0]; + int64_t* pipiv = (int64_t*)&ipiv[0]; + accelerator_for(i,batchCount,1,{ + pBnn[i] = &p_Bnn[n*n*i]; + }); + accelerator_for(i,batchCount*n*n,1,{ + int64_t a = i / (n*n); + int64_t b = i % (n*n); + p_Bnn[i] = pAnn[i][b]; + }); + + RealD t1 = usecond(); + getrfBatched(n, Bnn, ipiv, info); + RealD t2 = usecond(); + accelerator_for(i,batchCount,1,{ + dtype det = 1.0; + for (int64_t j=0;j Date: Fri, 21 Mar 2025 20:02:08 +0100 Subject: [PATCH 16/53] Complete first version on CUDA/HIP/CPU of GridBlas::inverseBatched and determinantBatched --- Grid/algorithms/blas/BatchedBlas.h | 79 ++++++++++++------------------ 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 52137b48d0..aef3246dcb 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -1017,7 +1017,7 @@ class GridBLAS { deviceVector &info) { int64_t batchCount = Ann.size(); - assert(ipiv.size()==batchCount); + assert(ipiv.size()==batchCount*n); assert(info.size()==batchCount); #ifdef GRID_HIP @@ -1060,7 +1060,7 @@ void getrfBatched(int64_t n, deviceVector &info) { int64_t batchCount = Ann.size(); - assert(ipiv.size()==batchCount); + assert(ipiv.size()==batchCount*n); assert(info.size()==batchCount); #ifdef GRID_HIP @@ -1104,7 +1104,7 @@ void getrfBatched(int64_t n, deviceVector &Cnn) { int64_t batchCount = Ann.size(); - assert(ipiv.size()==batchCount); + assert(ipiv.size()==batchCount*n); assert(info.size()==batchCount); assert(Cnn.size()==batchCount); @@ -1150,7 +1150,7 @@ void getrfBatched(int64_t n, deviceVector &Cnn) { int64_t batchCount = Ann.size(); - assert(ipiv.size()==batchCount); + assert(ipiv.size()==batchCount*n); assert(info.size()==batchCount); assert(Cnn.size()==batchCount); @@ -1191,75 +1191,58 @@ void getrfBatched(int64_t n, template void inverseBatched(int64_t n, - deviceVector &Ann, - deviceVector &Cnn) { + deviceVector &Ann, // this will be overwritten with LU decomposition + deviceVector &Cnn // this will be overwritten with the inverse + ) { int64_t batchCount = Ann.size(); RealD t0 = usecond(); deviceVector ipiv(batchCount*n); deviceVector info(batchCount); - deviceVector _Bnn(batchCount*n*n); - deviceVector Bnn(batchCount); - accelerator_for(i,batchCount,1,{ - Bnn[i] = &_Bnn[n*n*i]; - }); - accelerator_for(i,batchCount*n*n,1,{ - int64_t a = i / (n*n); - int64_t b = i % (n*n); - _Bnn[i] = Ann[i][b]; - }); - - RealD t1 = usecond(); - getrfBatched(n, Bnn, ipiv, info); + //RealD t1 = usecond(); + getrfBatched(n, Ann, ipiv, info); // test info for non-invertibility? set to nan if yes? - RealD t2 = usecond(); - getriBatched(n, Bnn, ipiv, info, Cnn); - RealD t3 = usecond(); - std::cout << GridLogMessage << "Temp " << t1-t0 << " rf " << t2-t1 << " ri " << t3-t2 << std::endl; + getriBatched(n, Ann, ipiv, info, Cnn); + //synchronise(); + //RealD t2 = usecond(); + //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl; } template void determinantBatched(int64_t n, - deviceVector &Ann, - deviceVector &C) { + deviceVector &Ann, // this will be overwritten with LU decomposition + deviceVector &C // this will be overwritten with determinant + ) { int64_t batchCount = Ann.size(); - RealD t0 = usecond(); + //RealD t0 = usecond(); deviceVector ipiv(batchCount*n); deviceVector info(batchCount); - deviceVector _Bnn(batchCount*n*n); - deviceVector Bnn(batchCount); - + dtype** pAnn = (dtype**)&Ann[0]; - dtype** pBnn = (dtype**)&Bnn[0]; - dtype* p_Bnn = (dtype*)&_Bnn[0]; - dtype* pC = (dtype*)&C[0]; + dtype** pC = (dtype**)&C[0]; +#if defined(GRID_CUDA) || defined(GRID_HIP) + int* pipiv = (int*)&ipiv[0]; +#else int64_t* pipiv = (int64_t*)&ipiv[0]; - accelerator_for(i,batchCount,1,{ - pBnn[i] = &p_Bnn[n*n*i]; - }); - accelerator_for(i,batchCount*n*n,1,{ - int64_t a = i / (n*n); - int64_t b = i % (n*n); - p_Bnn[i] = pAnn[i][b]; - }); +#endif - RealD t1 = usecond(); - getrfBatched(n, Bnn, ipiv, info); - RealD t2 = usecond(); + //RealD t1 = usecond(); + getrfBatched(n, Ann, ipiv, info); + //RealD t2 = usecond(); accelerator_for(i,batchCount,1,{ dtype det = 1.0; for (int64_t j=0;j Date: Thu, 10 Apr 2025 20:12:12 +0200 Subject: [PATCH 17/53] CompactWilsonClover5D --- .../fermion/CompactWilsonCloverFermion5D.h | 196 +++++++++ Grid/qcd/action/fermion/Fermion.h | 6 + ...mpactWilsonCloverFermion5DImplementation.h | 376 ++++++++++++++++++ ...lsonCloverFermion5DInstantiation.cc.master | 45 +++ ...CloverFermion5DInstantiationWilsonImplD.cc | 1 + ...CloverFermion5DInstantiationWilsonImplF.cc | 1 + .../instantiation/generate_instantiations.sh | 2 +- 7 files changed, 626 insertions(+), 1 deletion(-) create mode 100644 Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h create mode 100644 Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h create mode 100644 Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h new file mode 100644 index 0000000000..2c6aa587d2 --- /dev/null +++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h @@ -0,0 +1,196 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h + + Copyright (C) 2020 - 2025 + + Author: Daniel Richtmann + Author: Nils Meyer + Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#pragma once + +#include +#include +#include +#include + +NAMESPACE_BEGIN(Grid); + +// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description + +template +class CompactWilsonCloverFermion5D : public WilsonFermion5D, + public WilsonCloverHelpers, + public CompactWilsonCloverHelpers { + ///////////////////////////////////////////// + // Sizes + ///////////////////////////////////////////// + +public: + + INHERIT_COMPACT_CLOVER_SIZES(Impl); + + ///////////////////////////////////////////// + // Type definitions + ///////////////////////////////////////////// + +public: + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + INHERIT_COMPACT_CLOVER_TYPES(Impl); + + typedef WilsonFermion5D WilsonBase; + typedef WilsonCloverHelpers Helpers; + typedef CompactWilsonCloverHelpers CompactHelpers; + + ///////////////////////////////////////////// + // Constructors + ///////////////////////////////////////////// + +public: + + CompactWilsonCloverFermion5D(GaugeField& _Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + const RealD _mass, + const RealD _csw_r = 0.0, + const RealD _csw_t = 0.0, + const RealD _cF = 1.0, + const ImplParams& impl_p = ImplParams()); + + ///////////////////////////////////////////// + // Member functions (implementing interface) + ///////////////////////////////////////////// + +public: + + virtual void Instantiatable() {}; + int ConstEE() override { return 0; }; + int isTrivialEE() override { return 0; }; + + void Dhop(const FermionField& in, FermionField& out, int dag) override; + + void DhopOE(const FermionField& in, FermionField& out, int dag) override; + + void DhopEO(const FermionField& in, FermionField& out, int dag) override; + + void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override; + + void DhopDirAll(const FermionField& in, std::vector& out) /* override */; + + void M(const FermionField& in, FermionField& out) override; + + void Mdag(const FermionField& in, FermionField& out) override; + + void Meooe(const FermionField& in, FermionField& out) override; + + void MeooeDag(const FermionField& in, FermionField& out) override; + + void Mooee(const FermionField& in, FermionField& out) override; + + void MooeeDag(const FermionField& in, FermionField& out) override; + + void MooeeInv(const FermionField& in, FermionField& out) override; + + void MooeeInvDag(const FermionField& in, FermionField& out) override; + + void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override; + + void MdirAll(const FermionField& in, std::vector& out) override; + + void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override; + + void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; + + void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; + + ///////////////////////////////////////////// + // Member functions (internals) + ///////////////////////////////////////////// + + void MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle); + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + + void ImportGauge(const GaugeField& _Umu) override; + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + +private: + + template + const MaskField* getCorrectMaskField(const Field &in) const { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + return &this->BoundaryMaskOdd; + } else { + return &this->BoundaryMaskEven; + } + } else { + return &this->BoundaryMask; + } + } + + template + void ApplyBoundaryMask(Field& f) { + const MaskField* m = getCorrectMaskField(f); assert(m != nullptr); + assert(m != nullptr); + CompactHelpers::ApplyBoundaryMask(f, *m); + } + + ///////////////////////////////////////////// + // Member Data + ///////////////////////////////////////////// + +public: + + RealD csw_r; + RealD csw_t; + RealD cF; + int n_rhs; + + bool fixedBoundaries; + + CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd; + CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd; + + CloverTriangleField Triangle, TriangleEven, TriangleOdd; + CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd; + + FermionField Tmp; + + MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd; +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 8d7b3fdc30..a3d96d9b03 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson); NAMESPACE_CHECK(WilsonTM); #include // 4d wilson clover fermions #include // 4d compact wilson clover fermions +#include // 5d compact wilson clover fermions NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); @@ -164,12 +165,17 @@ typedef WilsonClover WilsonCloverTwoIndexAntiS // Compact Clover fermions template using CompactWilsonClover = CompactWilsonCloverFermion>; +template using CompactWilsonClover5D = CompactWilsonCloverFermion5D>; template using CompactWilsonExpClover = CompactWilsonCloverFermion>; typedef CompactWilsonClover CompactWilsonCloverFermionD2; typedef CompactWilsonClover CompactWilsonCloverFermionF; typedef CompactWilsonClover CompactWilsonCloverFermionD; +typedef CompactWilsonClover5D CompactWilsonCloverFermion5DD2; +typedef CompactWilsonClover5D CompactWilsonCloverFermion5DF; +typedef CompactWilsonClover5D CompactWilsonCloverFermion5DD; + typedef CompactWilsonExpClover CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover CompactWilsonExpCloverFermionD; diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h new file mode 100644 index 0000000000..e65fb6d675 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h @@ -0,0 +1,376 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h + + Copyright (C) 2017 - 2025 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include + + +NAMESPACE_BEGIN(Grid); +template +CompactWilsonCloverFermion5D::CompactWilsonCloverFermion5D(GaugeField& _Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + const RealD _mass, + const RealD _csw_r, + const RealD _csw_t, + const RealD _cF, + const ImplParams& impl_p) + : WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p) + , csw_r(_csw_r) + , csw_t(_csw_t) + , cF(_cF) + , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0) + , Diagonal(&FourDimGrid), Triangle(&FourDimGrid) + , DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid) + , DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid) + , DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid) + , DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid) + , DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid) + , Tmp(&FiveDimGrid) + , BoundaryMask(&FiveDimGrid) + , BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid) +{ + assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3); + + csw_r *= 0.5; + csw_t *= 0.5; + //if (clover_anisotropy.isAnisotropic) + // csw_r /= clover_anisotropy.xi_0; + + ImportGauge(_Umu); + if (fixedBoundaries) { + this->BoundaryMaskEven.Checkerboard() = Even; + this->BoundaryMaskOdd.Checkerboard() = Odd; + CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); + } +} + +template +void CompactWilsonCloverFermion5D::Dhop(const FermionField& in, FermionField& out, int dag) { + WilsonBase::Dhop(in, out, dag); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::DhopOE(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopOE(in, out, dag); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::DhopEO(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopEO(in, out, dag); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) { + WilsonBase::DhopDir(in, out, dir, disp); + if(this->fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::DhopDirAll(const FermionField& in, std::vector& out) { + WilsonBase::DhopDirAll(in, out); + if(this->fixedBoundaries) { + for(auto& o : out) ApplyBoundaryMask(o); + } +} + +template +void CompactWilsonCloverFermion5D::M(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc + Mooee(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::Mdag(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc + MooeeDag(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::Meooe(const FermionField& in, FermionField& out) { + WilsonBase::Meooe(in, out); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::MeooeDag(const FermionField& in, FermionField& out) { + WilsonBase::MeooeDag(in, out); + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::Mooee(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalOdd, TriangleOdd); + } else { + MooeeInternal(in, out, DiagonalEven, TriangleEven); + } + } else { + MooeeInternal(in, out, Diagonal, Triangle); + } + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::MooeeDag(const FermionField& in, FermionField& out) { + Mooee(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion5D::MooeeInv(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd); + } else { + MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven); + } + } else { + MooeeInternal(in, out, DiagonalInv, TriangleInv); + } + if(fixedBoundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion5D::MooeeInvDag(const FermionField& in, FermionField& out) { + MooeeInv(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion5D::Mdir(const FermionField& in, FermionField& out, int dir, int disp) { + DhopDir(in, out, dir, disp); +} + +template +void CompactWilsonCloverFermion5D::MdirAll(const FermionField& in, std::vector& out) { + DhopDirAll(in, out); +} + +template +void CompactWilsonCloverFermion5D::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { + assert(!fixedBoundaries); // TODO check for changes required for open bc + + // NOTE: code copied from original clover term + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; +} + +template +void CompactWilsonCloverFermion5D::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion5D::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion5D::MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + out.Checkerboard() = in.Checkerboard(); + conformable(in, out); + CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle); +} + +template +void CompactWilsonCloverFermion5D::ImportGauge(const GaugeField& _Umu) { + // NOTE: parts copied from original implementation + + // Import gauge into base class + double t0 = usecond(); + WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that + + // Initialize temporary variables + double t1 = usecond(); + conformable(_Umu.Grid(), this->GaugeGrid()); + GridBase* grid = _Umu.Grid(); + typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + CloverField TmpOriginal(grid); + CloverField TmpInverse(grid); + + // Compute the field strength terms mu>nu + double t2 = usecond(); + WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); + WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); + WilsonLoops::FieldStrength(Bz, _Umu, Ydir, Xdir); + WilsonLoops::FieldStrength(Ex, _Umu, Tdir, Xdir); + WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); + WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + + // Compute the Clover Operator acting on Colour and Spin + // multiply here by the clover coefficients for the anisotropy + double t3 = usecond(); + TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r; + TmpOriginal += Helpers::fillCloverXZ(By) * csw_r; + TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r; + TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; + TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; + TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; + + // Instantiate the clover term + // - In case of the standard clover the mass term is added + // - In case of the exponential clover the clover term is exponentiated + double t4 = usecond(); + CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/); + + // Convert the data layout of the clover term + double t5 = usecond(); + CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); + + // Modify the clover term at the temporal boundaries in case of open boundary conditions + double t6 = usecond(); + if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/); + + // Invert the Clover term + // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved + // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted. + // TODO: For now this inversion is explictly done on the CPU + double t7 = usecond(); + CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries); + + // Fill the remaining clover fields + double t8 = usecond(); + pickCheckerboard(Even, DiagonalEven, Diagonal); + pickCheckerboard(Even, TriangleEven, Triangle); + pickCheckerboard(Odd, DiagonalOdd, Diagonal); + pickCheckerboard(Odd, TriangleOdd, Triangle); + pickCheckerboard(Even, DiagonalInvEven, DiagonalInv); + pickCheckerboard(Even, TriangleInvEven, TriangleInv); + pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv); + pickCheckerboard(Odd, TriangleInvOdd, TriangleInv); + + // Report timings + double t9 = usecond(); + + std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl; + std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl; + std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl; + std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl; + std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl; + std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl; + std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl; + std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl; + std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl; + std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl; + std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl; +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master new file mode 100644 index 0000000000..fd9c787bfa --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master @@ -0,0 +1,45 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master + + Copyright (C) 2017 - 2025 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + Author: Mattia Bruno + Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class CompactWilsonCloverFermion5D>; +template class CompactWilsonCloverFermion5D>; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc new file mode 120000 index 0000000000..ad6eba2865 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermion5DInstantiationWilsonImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc new file mode 120000 index 0000000000..ad6eba2865 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermion5DInstantiationWilsonImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index 728dc5e783..18194a23b8 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -62,7 +62,7 @@ do done done -CC_LIST="CompactWilsonCloverFermionInstantiation" +CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation" for impl in $COMPACT_WILSON_IMPL_LIST do From 147c2eba751138c92847e2eb48397ef5e03f392d Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 10 Apr 2025 22:14:55 +0200 Subject: [PATCH 18/53] Add checkerboarding to 5D compact clover --- Grid/qcd/action/fermion/WilsonFermion5D.h | 12 ++--- .../WilsonFermion5DImplementation.h | 49 +++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 0b07d320ab..44d624efc3 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -91,13 +91,13 @@ class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic virtual void Mdag (const FermionField &in, FermionField &out){assert(0);}; // half checkerboard operations; leave unimplemented as abstract for now - virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; + virtual void Meooe (const FermionField &in, FermionField &out); + virtual void Mooee (const FermionField &in, FermionField &out); + virtual void MooeeInv (const FermionField &in, FermionField &out); - virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MeooeDag (const FermionField &in, FermionField &out); + virtual void MooeeDag (const FermionField &in, FermionField &out); + virtual void MooeeInvDag (const FermionField &in, FermionField &out); virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 95af4c3897..dd22a5eb37 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -14,6 +14,7 @@ Author: paboyle Author: Guido Cossu Author: Andrew Lawson Author: Vera Guelpers +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -450,6 +451,54 @@ void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag Dhop(in,out,dag); // -0.5 is included axpy(out,4.0-M5,in,out); } +template +void WilsonFermion5D::Meooe(const FermionField &in, FermionField &out) +{ + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } +} + +template +void WilsonFermion5D::MeooeDag(const FermionField &in, FermionField &out) +{ + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } +} + +template +void WilsonFermion5D::Mooee(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + typename FermionField::scalar_type scal(4.0 + M5); + out = scal * in; +} + +template +void WilsonFermion5D::MooeeDag(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + Mooee(in, out); +} + +template +void WilsonFermion5D::MooeeInv(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + out = (1.0/(4.0 + M5))*in; +} + +template +void WilsonFermion5D::MooeeInvDag(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + MooeeInv(in,out); +} template void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector twist) From 5e3a6f25b6adf6b12fbffa4ca116bc2bf1d74889 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 17 Apr 2025 18:38:10 +0200 Subject: [PATCH 19/53] Allow for reduced-precision GEMM --- Grid/algorithms/blas/BatchedBlas.h | 122 +++++++++++++++++++---------- 1 file changed, 80 insertions(+), 42 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index aef3246dcb..d25077345f 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid); #endif enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; +enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 }; class GridBLAS { public: @@ -97,7 +98,21 @@ class GridBLAS { gridblasInit=1; } } - + +#ifdef GRID_CUDA + cudaDataType toDataType(GridBLASPrecision_t p) { + switch (p) { + case GridBLAS_PRECISION_16F: + return CUBLAS_COMPUTE_32F_FAST_16F; + case GridBLAS_PRECISION_16BF: + return CUBLAS_COMPUTE_32F_FAST_16BF; + case GridBLAS_PRECISION_TF32: + return CUBLAS_COMPUTE_32F_FAST_TF32; + default: + assert(0); + } + } +#endif // Force construct once GridBLAS() { Init(); }; ~GridBLAS() { }; @@ -138,8 +153,10 @@ class GridBLAS { deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexD beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { + assert(precision == GridBLAS_PRECISION_DEFAULT); gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, m,n,k, alpha, @@ -201,8 +218,10 @@ class GridBLAS { deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexD beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { + assert(precision == GridBLAS_PRECISION_DEFAULT); RealD t2=usecond(); int32_t batchCount = Amk.size(); assert(Bkn.size()==batchCount); @@ -448,7 +467,8 @@ class GridBLAS { deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexF beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { RealD t2=usecond(); int32_t batchCount = Amk.size(); @@ -473,6 +493,7 @@ class GridBLAS { assert(Bkn.size()==batchCount); assert(Cmn.size()==batchCount); #ifdef GRID_HIP + assert(precision == GridBLAS_PRECISION_DEFAULT); hipblasOperation_t hOpA; hipblasOperation_t hOpB; if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; @@ -503,50 +524,67 @@ class GridBLAS { if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; - auto err = cublasCgemmBatched(gridblasHandle, - hOpA, - hOpB, - m,n,k, - (cuComplex *) &alpha_p[0], - (cuComplex **)&Amk[0], lda, - (cuComplex **)&Bkn[0], ldb, - (cuComplex *) &beta_p[0], - (cuComplex **)&Cmn[0], ldc, - batchCount); + cublasStatus_t err; + if (precision == GridBLAS_PRECISION_DEFAULT) { + err = cublasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (cuComplex *) &alpha_p[0], + (cuComplex **)&Amk[0], lda, + (cuComplex **)&Bkn[0], ldb, + (cuComplex *) &beta_p[0], + (cuComplex **)&Cmn[0], ldc, + batchCount); + } else { + cudaDataType compute_precision = toDataType(precision); + err = cublasGemmBatchedEx(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (cuComplex *) &alpha_p[0], + (cuComplex **)&Amk[0], CUDA_C_32F, lda, + (cuComplex **)&Bkn[0], CUDA_C_32F, ldb, + (cuComplex *) &beta_p[0], + (cuComplex **)&Cmn[0], CUDA_C_32F, ldc, + batchCount, compute_precision, CUBLAS_GEMM_DEFAULT); + } assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - int64_t m64=m; - int64_t n64=n; - int64_t k64=k; - int64_t lda64=lda; - int64_t ldb64=ldb; - int64_t ldc64=ldc; - int64_t batchCount64=batchCount; - - oneapi::mkl::transpose iOpA; - oneapi::mkl::transpose iOpB; - - if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; - if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; - if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; - if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; - if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; - if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; - - oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, - &iOpA, - &iOpB, - &m64,&n64,&k64, - (ComplexF *) &alpha_p[0], - (const ComplexF **)&Amk[0], (const int64_t *)&lda64, - (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, - (ComplexF *) &beta_p[0], - (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, - (int64_t)1,&batchCount64,std::vector()); + assert(precision == GridBLAS_PRECISION_DEFAULT); + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + + oneapi::mkl::transpose iOpA; + oneapi::mkl::transpose iOpB; + + if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; + if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; + if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; + if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; + if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; + if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; + + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &iOpA, + &iOpB, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + assert(precision == GridBLAS_PRECISION_DEFAULT); // Need a default/reference implementation; use Eigen if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { From f0573d04c76dd67a0af2ef1ce18ddf6b227567e2 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 17 Apr 2025 19:18:54 +0200 Subject: [PATCH 20/53] fix cuda BatchedGemmEx --- Grid/algorithms/blas/BatchedBlas.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index d25077345f..8c188321aa 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -100,7 +100,7 @@ class GridBLAS { } #ifdef GRID_CUDA - cudaDataType toDataType(GridBLASPrecision_t p) { + cublasComputeType_t toDataType(GridBLASPrecision_t p) { switch (p) { case GridBLAS_PRECISION_16F: return CUBLAS_COMPUTE_32F_FAST_16F; @@ -537,16 +537,16 @@ class GridBLAS { (cuComplex **)&Cmn[0], ldc, batchCount); } else { - cudaDataType compute_precision = toDataType(precision); + cublasComputeType_t compute_precision = toDataType(precision); err = cublasGemmBatchedEx(gridblasHandle, hOpA, hOpB, m,n,k, - (cuComplex *) &alpha_p[0], - (cuComplex **)&Amk[0], CUDA_C_32F, lda, - (cuComplex **)&Bkn[0], CUDA_C_32F, ldb, - (cuComplex *) &beta_p[0], - (cuComplex **)&Cmn[0], CUDA_C_32F, ldc, + (void *) &alpha_p[0], + (void **)&Amk[0], CUDA_C_32F, lda, + (void **)&Bkn[0], CUDA_C_32F, ldb, + (void *) &beta_p[0], + (void **)&Cmn[0], CUDA_C_32F, ldc, batchCount, compute_precision, CUBLAS_GEMM_DEFAULT); } assert(err==CUBLAS_STATUS_SUCCESS); From 0fb69d6fa3bd284d75779be0fdec02f059cd08c9 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 8 May 2025 08:32:02 +0000 Subject: [PATCH 21/53] no thread_for for identity operator --- Grid/lattice/Lattice_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 515c847f95..d8b9a618b9 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -236,7 +236,7 @@ class Lattice : public LatticeAccelerator template inline Lattice & operator = (const sobj & r){ vobj vtmp; vtmp = r; -#if 0 +#if 1 deviceVector vvtmp(1); acceleratorPut(vvtmp[0],vtmp); vobj *vvtmp_p = & vvtmp[0]; From f666dea5cf91e219021c2a6e0d778e72338c83e9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 23 May 2025 20:58:16 +0000 Subject: [PATCH 22/53] Making running on Aurora more debuggable --- Grid/algorithms/approx/Chebyshev.h | 2 ++ Grid/communicator/Communicator_mpi3.cc | 12 +++++++++ Grid/stencil/Stencil.h | 1 + Grid/util/Init.cc | 7 +++-- HMC/ComputeWilsonFlow.cc | 9 ++++++- visualisation/CMakeLists.txt | 37 ++++++++++++++++++++++++++ visualisation/cmake-command | 17 ++++++++++++ 7 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 visualisation/CMakeLists.txt create mode 100644 visualisation/cmake-command diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 3c6796855d..5863e948d0 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -269,7 +269,9 @@ class Chebyshev : public OperatorFunction { RealD xscale = 2.0/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo); Linop.HermOp(T0,y); + grid->Barrier(); axpby(T1,xscale,mscale,y,in); + grid->Barrier(); // sum = .5 c[0] T0 + c[1] T1 // out = ()*T0 + Coeffs[1]*T1; diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 7dc706dfb7..55febcee01 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -259,32 +259,39 @@ CartesianCommunicator::~CartesianCommunicator() } #ifdef USE_GRID_REDUCTION void CartesianCommunicator::GlobalSum(float &f){ + FlightRecorder::StepLog("GlobalSumP2P"); CartesianCommunicator::GlobalSumP2P(f); } void CartesianCommunicator::GlobalSum(double &d) { + FlightRecorder::StepLog("GlobalSumP2P"); CartesianCommunicator::GlobalSumP2P(d); } #else void CartesianCommunicator::GlobalSum(float &f){ + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSum(double &d) { + FlightRecorder::StepLog("AllReduce"); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); assert(ierr==0); } #endif void CartesianCommunicator::GlobalSum(uint32_t &u){ + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSum(uint64_t &u){ + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ + FlightRecorder::StepLog("AllReduceVector"); int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } @@ -715,6 +722,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list) @@ -722,11 +730,13 @@ void CartesianCommunicator::StencilBarrier(void) //} void CartesianCommunicator::Barrier(void) { + FlightRecorder::StepLog("GridBarrier"); int ierr = MPI_Barrier(communicator); assert(ierr==0); } void CartesianCommunicator::Broadcast(int root,void* data, int bytes) { + FlightRecorder::StepLog("Broadcast"); int ierr=MPI_Bcast(data, bytes, MPI_BYTE, @@ -745,6 +755,7 @@ void CartesianCommunicator::BarrierWorld(void){ } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { + FlightRecorder::StepLog("BroadcastWorld"); int ierr= MPI_Bcast(data, bytes, MPI_BYTE, @@ -767,6 +778,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words, } void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) { + FlightRecorder::StepLog("AllToAll"); // MPI is a pain and uses "int" arguments // 64*64*64*128*16 == 500Million elements of data. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1142891ae5..53e6c56a48 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -385,6 +385,7 @@ class CartesianStencil : public CartesianStencilAccelerator void writeFile(T& in, std::string const fname){ Grid::emptyUserRecord record; Grid::ScidacWriter WR(in.Grid()->IsBoss()); WR.open(fname); - WR.writeScidacFieldRecord(in,record,0); + WR.writeScidacFieldRecord(in,record,0); // Lexico WR.close(); #endif // What is the appropriate way to throw error? @@ -171,6 +171,12 @@ int main(int argc, char **argv) { std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; writeFile(qfield,tfile); + std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post; + { + // PeriodicGimplR::GaugeField Ucopy = U; + // NerscIO::writeConfiguration(Ucopy,ufile); + } + RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); RealD T = real( sum(qfield) ); Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; @@ -183,6 +189,7 @@ int main(int argc, char **argv) { int t=WFPar.maxTau; WF.smear(Uflow, Umu); + // NerscIO::writeConfiguration(Uflow,filesmr); RealD WFlow_plaq = WilsonLoops::avgPlaquette(Uflow); RealD WFlow_TC = WilsonLoops::TopologicalCharge(Uflow); diff --git a/visualisation/CMakeLists.txt b/visualisation/CMakeLists.txt new file mode 100644 index 0000000000..4e9cf7b6b7 --- /dev/null +++ b/visualisation/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) + +project(GridViewer) + +list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/") + +find_package(VTK COMPONENTS + CommonColor + CommonCore + FiltersCore + FiltersModeling + IOImage + IOFFMPEG + InteractionStyle + InteractionWidgets + RenderingContextOpenGL2 + RenderingCore + RenderingFreeType + RenderingGL2PSOpenGL2 + RenderingOpenGL2 +) + +if (NOT VTK_FOUND) + message(FATAL_ERROR "GridViewer: Unable to find the VTK build folder.") +endif() + +# Prevent a "command line is too long" failure in Windows. +set(CMAKE_NINJA_FORCE_RESPONSE_FILE "ON" CACHE BOOL "Force Ninja to use response files.") + +add_executable(FieldDensityAnimate MACOSX_BUNDLE FieldDensityAnimate.cxx ) + target_link_libraries(FieldDensityAnimate PRIVATE ${VTK_LIBRARIES} +) +# vtk_module_autoinit is needed +vtk_module_autoinit( + TARGETS FieldDensityAnimate + MODULES ${VTK_LIBRARIES} +) diff --git a/visualisation/cmake-command b/visualisation/cmake-command new file mode 100644 index 0000000000..4f18c11016 --- /dev/null +++ b/visualisation/cmake-command @@ -0,0 +1,17 @@ +export grid_config=/home/paboyle/GPT/install/bin/grid-config +libs=`$grid_config --libs` +ldflags=`$grid_config --ldflags` +cxxflags=`$grid_config --cxxflags` +cxx=`$grid_config --cxx` +cc=icx + +mkdir build +cd build + +echo CC $cc +echo CXX $cxx +echo CXXFLAGS $cxxflags +echo LDFLAGS $ldflags +echo LIBS $libs + +LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags " From 1d234921dbbfea80f10c3f72423aa081d280e8ed Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 22 Jun 2025 13:11:17 +0000 Subject: [PATCH 23/53] --debug-stdout in subdirectory --- Grid/util/Init.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 3e7f5a6ddd..02a70100ac 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -382,15 +382,20 @@ void Grid_init(int *argc,char ***argv) ///////////////////////////////////////////////////////////////// if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){ Grid_quiesce_nodes(); - } else { + } else { + char* root = getenv("GRID_STDOUT_ROOT"); FILE *fp; std::ostringstream fname; + if (root) + fname << root << "/"; fname<<"Grid.stdout."; fname< Date: Thu, 26 Jun 2025 21:01:28 +0200 Subject: [PATCH 24/53] fix communicator_none --- Grid/communicator/Communicator_base.cc | 2 -- Grid/communicator/Communicator_base.h | 10 +++------- Grid/communicator/Communicator_none.cc | 14 +++++++------- configure.ac | 4 ++-- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc index a52e85a465..f9a4c442d9 100644 --- a/Grid/communicator/Communicator_base.cc +++ b/Grid/communicator/Communicator_base.cc @@ -41,8 +41,6 @@ bool Stencil_force_mpi = true; CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; int CartesianCommunicator::nCommThreads = -1; -CartesianCommunicator::StencilCompressionPolicy_t -CartesianCommunicator::StencilCompressionPolicy= CartesianCommunicator::StencilCompressionPolicyNone; ///////////////////////////////// // Grid information queries diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 1f985b1777..8fd8ec347f 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -51,10 +51,6 @@ class CartesianCommunicator : public SharedMemory { static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } static int nCommThreads; - enum StencilCompressionPolicy_t { StencilCompressionPolicyNone, StencilCompressionPolicyBfloat16 }; - static StencilCompressionPolicy_t StencilCompressionPolicy; - static void SetStencilCompressionPolicy(StencilCompressionPolicy_t policy ) { StencilCompressionPolicy = policy; } - //////////////////////////////////////////// // Communicator should know nothing of the physics grid, only processor grid. //////////////////////////////////////////// @@ -192,7 +188,7 @@ class CartesianCommunicator : public SharedMemory { int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int bytes,int dir,size_t word_size); + int bytes,int dir); double StencilSendToRecvFromPrepare(std::vector &list, void *xmit, @@ -210,10 +206,10 @@ class CartesianCommunicator : public SharedMemory { int xmit_to_rank,int do_xmit, void *recv,void *recv_comp, int recv_from_rank,int do_recv, - int xbytes,int rbytes,int dir,size_t word_size); + int xbytes,int rbytes,int dir); - void StencilSendToRecvFromComplete(std::vector &waitall,int i,size_t word_size); + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); //////////////////////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 3048841ca5..4be3c1feff 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -130,7 +130,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank,int dox, void *recv, int recv_from_rank,int dor, - int bytes, int dir,size_t word_size) + int bytes, int dir) { return 2.0*bytes; } @@ -146,15 +146,15 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, - void *xmit, - int xmit_to_rank,int dox, - void *recv, - int recv_from_rank,int dor, - int xbytes,int rbytes, int dir,size_t word_size) + void *xmit,void *xmit_comp, + int dest,int dox, + void *recv,void *recv_comp, + int from,int dor, + int xbytes,int rbytes,int dir) { return xbytes+rbytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir,size_t word_size) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { } diff --git a/configure.ac b/configure.ac index 9664a6758d..1aa24dd4e2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_PREREQ([2.69]) +AC_PREREQ([2.72]) AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid]) AC_CANONICAL_BUILD AC_CANONICAL_HOST @@ -214,7 +214,7 @@ esac ############### Symplectic group AC_ARG_ENABLE([Sp], - [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])], + [AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])], [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no]) AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ]) From 1016ed514089a56bc8bdfb2872679c98959402cf Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 27 Jun 2025 08:14:08 +0200 Subject: [PATCH 25/53] experiment with AC 2.71 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 1aa24dd4e2..29c71b0650 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_PREREQ([2.72]) +AC_PREREQ([2.71]) AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid]) AC_CANONICAL_BUILD AC_CANONICAL_HOST From ffc6dbda2403e84c34be20e0c71b05429f826685 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 27 Jun 2025 20:16:07 +0000 Subject: [PATCH 26/53] towards argonne compatibility --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 29c71b0650..dc09f87c3b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_PREREQ([2.71]) +AC_PREREQ([2.69]) AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid]) AC_CANONICAL_BUILD AC_CANONICAL_HOST From dc734001bf65c24baeb8a81111d44afd3ac6c436 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 28 Jun 2025 08:26:52 +0000 Subject: [PATCH 27/53] SYCL blas compatibility --- Grid/algorithms/blas/BatchedBlas.h | 137 +++++++++++++++++------------ 1 file changed, 83 insertions(+), 54 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index a769e011b6..c1025b5953 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -1049,6 +1049,40 @@ class GridBLAS { #else +#ifdef GRID_SYCL + template + void getrfBatchedSYCL(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) { + + int64_t batchCount = Ann.size(); + + static deviceVector scratchpad; + int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount); + if (sp_size > scratchpad.size()) + scratchpad.resize(sp_size); + + static deviceVector _ipiv; + if (batchCount > _ipiv.size()) + _ipiv.resize(batchCount); + int64_t** p_ipiv = &_ipiv[0]; + int64_t* pipiv = &ipiv[0]; + + accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; }); + + oneapi::mkl::lapack::getrf_batch(*gridblasHandle, + &n, &n, + (T **)&Ann[0], + &n, + (int64_t**)&_ipiv[0], + (int64_t)1, &batchCount, + (T*)&scratchpad[0], (int64_t)scratchpad.size(), + std::vector()); + synchronise(); + } +#endif + void getrfBatched(int64_t n, deviceVector &Ann, deviceVector &ipiv, @@ -1075,24 +1109,11 @@ class GridBLAS { assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - assert(0); - /* - TODO: cache scratchpad - - oneapi::mkl::lapack::gerf_batch(*gridblasHandle, - &n, &n, - (ComplexD **)&Ann[0], - &n, - (int64_t)&ipiv[0], - (int64_t)1, &batchCount, - scratchpad, scratchpad_size, - std::vector()); - */ - synchronise(); + getrfBatchedSYCL(n, Ann, ipiv, info); #endif } -void getrfBatched(int64_t n, + void getrfBatched(int64_t n, deviceVector &Ann, deviceVector &ipiv, deviceVector &info) @@ -1118,23 +1139,55 @@ void getrfBatched(int64_t n, assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - assert(0); - /* - TODO: cache scratchpad + getrfBatchedSYCL(n, Ann, ipiv, info); +#endif + } + +#ifdef GRID_SYCL + template + void getriBatchedSYCL(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + + static deviceVector scratchpad; + int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size(*gridblasHandle, &n, &n, (int64_t)1, &batchCount); + if (sp_size > scratchpad.size()) + scratchpad.resize(sp_size); + + static deviceVector _ipiv; + if (batchCount > _ipiv.size()) + _ipiv.resize(batchCount); + int64_t** p_ipiv = &_ipiv[0]; + int64_t* pipiv = &ipiv[0]; + + accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; }); + + oneapi::mkl::lapack::getri_batch(*gridblasHandle, + &n, + (T **)&Ann[0], + &n, + (int64_t**)p_ipiv, + (int64_t)1, &batchCount, + (T *)&scratchpad[0], (int64_t)scratchpad.size(), + std::vector()); - oneapi::mkl::lapack::gerf_batch(*gridblasHandle, - &n, &n, - (ComplexD **)&Ann[0], - &n, - (int64_t)&ipiv[0], - (int64_t)1, &batchCount, - scratchpad, scratchpad_size, - std::vector()); - */ synchronise(); -#endif + + T** pA = &Ann[0]; + T** pC = &Cnn[0]; + accelerator_for(i, batchCount*n*n, 1, { + auto j = i / batchCount; + auto k = i % batchCount; + pC[k][j] = pA[k][j]; + }); } +#endif + void getriBatched(int64_t n, deviceVector &Ann, deviceVector &ipiv, @@ -1165,19 +1218,7 @@ void getrfBatched(int64_t n, assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - assert(0); - /* - TODO: cache scratchpad - oneapi::mkl::lapack::geri_batch(*gridblasHandle, - &n, &n, - (ComplexD **)&Ann[0], - &n, - (int64_t)&ipiv[0], - (int64_t)1, &batchCount, - scratchpad, scratchpad_size, - std::vector()); - */ - synchronise(); + getriBatchedSYCL(n, Ann, ipiv, info, Cnn); #endif } @@ -1211,19 +1252,7 @@ void getrfBatched(int64_t n, assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - assert(0); - /* - TODO: cache scratchpad - oneapi::mkl::lapack::geri_batch(*gridblasHandle, - &n, &n, - (ComplexD **)&Ann[0], - &n, - (int64_t)&ipiv[0], - (int64_t)1, &batchCount, - scratchpad, scratchpad_size, - std::vector()); - */ - synchronise(); + getriBatchedSYCL(n, Ann, ipiv, info, Cnn); #endif } From 5cca827323b27e1cba33ae3fa5e4d09cf0bc2851 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 30 Jun 2025 13:08:42 +0000 Subject: [PATCH 28/53] Rollback half-prec comms for now --- Grid/communicator/Communicator_base.h | 5 +- Grid/communicator/Communicator_mpi3.cc | 37 +-- Grid/communicator/Communicator_none.cc | 12 +- Grid/communicator/SharedMemoryMPI.cc | 56 +++- .../action/fermion/ImprovedStaggeredFermion.h | 6 - .../fermion/ImprovedStaggeredFermion5D.h | 6 - .../action/fermion/NaiveStaggeredFermion.h | 6 - Grid/qcd/action/fermion/WilsonCompressor.h | 206 ++++++++++++- Grid/qcd/action/fermion/WilsonFermion.h | 6 - Grid/qcd/action/fermion/WilsonFermion5D.h | 9 +- Grid/stencil/Stencil.cc | 9 +- Grid/stencil/Stencil.h | 283 ++++-------------- 12 files changed, 326 insertions(+), 315 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 8fd8ec347f..62fcfa7b01 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -183,7 +183,6 @@ class CartesianCommunicator : public SharedMemory { int recv_from_rank, int bytes); - int IsOffNode(int rank); double StencilSendToRecvFrom(void *xmit, int xmit_to_rank,int do_xmit, void *recv, @@ -202,9 +201,9 @@ class CartesianCommunicator : public SharedMemory { void StencilSendToRecvFromPollIRecv(std::vector &list); double StencilSendToRecvFromBegin(std::vector &list, - void *xmit,void *xmit_comp, + void *xmit, int xmit_to_rank,int do_xmit, - void *recv,void *recv_comp, + void *recv, int recv_from_rank,int do_recv, int xbytes,int rbytes,int dir); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 552d7f198b..8f3bf9afc3 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d) } #else void CartesianCommunicator::GlobalSum(float &f){ - FlightRecorder::StepLog("AllReduce float"); + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSum(double &d) { - FlightRecorder::StepLog("AllReduce double"); + FlightRecorder::StepLog("AllReduce"); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); assert(ierr==0); } #endif void CartesianCommunicator::GlobalSum(uint32_t &u){ - FlightRecorder::StepLog("AllReduce uint32_t"); + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSum(uint64_t &u){ - FlightRecorder::StepLog("AllReduce uint64_t"); + FlightRecorder::StepLog("AllReduce"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } @@ -301,31 +301,26 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){ assert(ierr==0); } void CartesianCommunicator::GlobalXOR(uint64_t &u){ - FlightRecorder::StepLog("GlobalXOR"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalMax(float &f) { - FlightRecorder::StepLog("GlobalMax"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalMax(double &d) { - FlightRecorder::StepLog("GlobalMax"); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSumVector(float *f,int N) { - FlightRecorder::StepLog("GlobalSumVector(float *)"); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); } void CartesianCommunicator::GlobalSumVector(double *d,int N) { - FlightRecorder::StepLog("GlobalSumVector(double *)"); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); assert(ierr==0); } @@ -400,16 +395,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { std::vector list; double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); - offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir); + offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } -int CartesianCommunicator::IsOffNode(int rank) -{ - int grank = ShmRanks[rank]; - if ( grank == MPI_UNDEFINED ) return true; - else return false; -} + #ifdef ACCELERATOR_AWARE_MPI void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; @@ -424,9 +414,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, - void *xmit,void *xmit_comp, + void *xmit, int dest,int dox, - void *recv,void *recv_comp, + void *recv, int from,int dor, int xbytes,int rbytes,int dir) { @@ -450,8 +440,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); assert(shm!=NULL); - // std::cout << " StencilSendToRecvFrom "< &list, - void *xmit,void *xmit_comp, + void *xmit, int dest,int dox, - void *recv,void *recv_comp, + void *recv, int from,int dor, int xbytes,int rbytes,int dir) { @@ -841,7 +829,6 @@ int CartesianCommunicator::RankWorld(void){ return r; } void CartesianCommunicator::BarrierWorld(void){ - FlightRecorder::StepLog("BarrierWorld"); int ierr = MPI_Barrier(communicator_world); assert(ierr==0); } diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 4be3c1feff..3dee8f4d11 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -124,8 +124,6 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest dest=0; } -int CartesianCommunicator::IsOffNode(int rank) { return false; } - double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank,int dox, void *recv, @@ -146,11 +144,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, - void *xmit,void *xmit_comp, - int dest,int dox, - void *recv,void *recv_comp, - int from,int dor, - int xbytes,int rbytes,int dir) + void *xmit, + int xmit_to_rank,int dox, + void *recv, + int recv_from_rank,int dor, + int xbytes,int rbytes, int dir) { return xbytes+rbytes; } diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 469ec3bb09..b5fd4197b9 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -543,21 +543,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI // printf("Host buffer allocate for GPU non-aware MPI\n"); +#if 0 + HostCommBuf= acceleratorAllocHost(bytes); +#else HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host +#if 0 + #warning "Moving host buffers to specific NUMA domain" + int numa; + char *numa_name=(char *)getenv("MPI_BUF_NUMA"); + if(numa_name) { + unsigned long page_size = sysconf(_SC_PAGESIZE); + numa = atoi(numa_name); + unsigned long page_count = bytes/page_size; + std::vector pages(page_count); + std::vector nodes(page_count,numa); + std::vector status(page_count,-1); + for(unsigned long p=0;p, public ImprovedS StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; - void SloppyComms(int sloppy) - { - Stencil.SetSloppyComms(sloppy); - StencilEven.SetSloppyComms(sloppy); - StencilOdd.SetSloppyComms(sloppy); - } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index ce65bfa3c9..2641a6b8a5 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -179,12 +179,6 @@ class ImprovedStaggeredFermion5D : public StaggeredKernels, public Improv StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; - void SloppyComms(int sloppy) - { - Stencil.SetSloppyComms(sloppy); - StencilEven.SetSloppyComms(sloppy); - StencilOdd.SetSloppyComms(sloppy); - } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 53b1826792..9ec6be9016 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -146,12 +146,6 @@ class NaiveStaggeredFermion : public StaggeredKernels, public NaiveStagger StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; - void SloppyComms(int sloppy) - { - Stencil.SetSloppyComms(sloppy); - StencilEven.SetSloppyComms(sloppy); - StencilOdd.SetSloppyComms(sloppy); - } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 458f2c8364..22f2d8e31f 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -32,6 +32,209 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////////////// +// Wilson compressor will need FaceGather policies for: +// Periodic, Dirichlet, and partial Dirichlet for DWF +/////////////////////////////////////////////////////////////// +const int dwf_compressor_depth=2; +#define DWF_COMPRESS +class FaceGatherPartialDWF +{ +public: +#ifdef DWF_COMPRESS + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; +#else + static int PartialCompressionFactor(GridBase *grid) { return 1;} +#endif + template + static void Gather_plane_simple (deviceVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + //DWF only hack: If a direction that is OFF node we use Partial Dirichlet + // Shrinks local and remote comms buffers + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else + int depth=Ls/2; +#endif + std::pair *table_v = & table[0]; + auto rhs_v = rhs.View(AcceleratorRead); + int vol=table.size()/Ls; + accelerator_forNB( idx,table.size(), vobj::Nsimd(), { + Integer i=idx/Ls; + Integer s=idx%Ls; + Integer sc=depth+s-(Ls-depth); + if(s=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]); + }); + rhs_v.ViewClose(); + } + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + auto Ls = dd.dims[0]; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else + int depth=Ls/2; +#endif + // Just pass in the Grid + auto kp = dd.kernel_p; + auto mp = dd.mpi_p; + int size= dd.buffer_size; + int vol= size/Ls; + accelerator_forNB(o,size,1,{ + int idx=o/Ls; + int s=o%Ls; + if ( s < depth ) { + int oo=s*vol+idx; + kp[o]=mp[oo]; + } else if ( s >= Ls-depth ) { + int sc = depth + s - (Ls-depth); + int oo=sc*vol+idx; + kp[o]=mp[oo]; + } else { + kp[o] = Zero();//fill rest with zero if partial dirichlet + } + }); + } + //////////////////////////////////////////////////////////////////////////////////////////// + // Need to gather *interior portions* for ALL s-slices in simd directions + // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side + // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. + //////////////////////////////////////////////////////////////////////////////////////////// + template + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else + int depth = Ls/2; +#endif + + // insertion of zeroes... + assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + auto rhs_v = rhs.View(AcceleratorRead); + auto p0=&pointers[0][0]; + auto p1=&pointers[1][0]; + auto tp=&table[0]; + int nnum=num/Ls; + accelerator_forNB(j, num, vobj::Nsimd(), { + // Reorders both local and remote comms buffers + // + int s = j % Ls; + int sp1 = (s+depth)%Ls; // peri incremented s slice + + int hxyz= j/Ls; + + int xyz0= hxyz*2; // xyzt part of coor + int xyz1= hxyz*2+1; + + int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... + + int kk0= xyz0*Ls + s ; // s=0 goes to s=1 + int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0 + compress.CompressExchange(p0[jj],p1[jj], + rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites + rhs_v[so+tp[kk1 ].second], + type); + }); + rhs_v.ViewClose(); + } + // Merge routine is for SIMD faces + template + static void MergeFace(decompressor decompress,Merger &mm) + { + auto Ls = mm.dims[0]; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else + int depth = Ls/2; +#endif + int num= mm.buffer_size/2; // relate vol and Ls to buffer size + auto mp = &mm.mpointer[0]; + auto vp0= &mm.vpointers[0][0]; // First arg is exchange first + auto vp1= &mm.vpointers[1][0]; + auto type= mm.type; + int nnum = num/Ls; + accelerator_forNB(o,num,Merger::Nsimd,{ + + int s=o%Ls; + int hxyz=o/Ls; // xyzt related component + int xyz0=hxyz*2; + int xyz1=hxyz*2+1; + + int sp = (s+depth)%Ls; + int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... + + int oo0= s+xyz0*Ls; + int oo1= s+xyz1*Ls; + + // same ss0, ss1 pair goes to new layout + decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type); + }); + } +}; +class FaceGatherDWFMixedBCs +{ +public: +#ifdef DWF_COMPRESS + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; +#else + static int PartialCompressionFactor(GridBase *grid) {return 1;} +#endif + + template + static void Gather_plane_simple (deviceVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + // std::cout << " face gather simple DWF partial "< + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + // std::cout << " face gather exch DWF partial "< + static void MergeFace(decompressor decompress,Merger &mm) + { + int partial = mm.partial; + // std::cout << " merge DWF partial "< + static void DecompressFace(decompressor decompress,Decompression &dd) + { + int partial = dd.partial; + // std::cout << " decompress DWF partial "< -class WilsonCompressorTemplate : public FaceGatherSimple +class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs +// : public FaceGatherSimple { public: diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 41a2471e2b..16320a9330 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -165,12 +165,6 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; - void SloppyComms(int sloppy) - { - Stencil.SetSloppyComms(sloppy); - StencilEven.SetSloppyComms(sloppy); - StencilOdd.SetSloppyComms(sloppy); - } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index ea0eaa32e0..9dadc866f6 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -204,14 +204,7 @@ class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic DoubledGaugeField Umu; DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - - - void SloppyComms(int sloppy) - { - Stencil.SetSloppyComms(sloppy); - StencilEven.SetSloppyComms(sloppy); - StencilOdd.SetSloppyComms(sloppy); - } + // Comms buffer // std::vector > comm_buf; diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc index 1d04716918..27dc75eddd 100644 --- a/Grid/stencil/Stencil.cc +++ b/Grid/stencil/Stencil.cc @@ -30,26 +30,25 @@ NAMESPACE_BEGIN(Grid); uint64_t DslashFullCount; -//uint64_t DslashPartialCount; +uint64_t DslashPartialCount; uint64_t DslashDirichletCount; void DslashResetCounts(void) { DslashFullCount=0; - // DslashPartialCount=0; + DslashPartialCount=0; DslashDirichletCount=0; } void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) { dirichlet = DslashDirichletCount; - partial = 0; + partial = DslashPartialCount; full = DslashFullCount; } void DslashLogFull(void) { DslashFullCount++;} -//void DslashLogPartial(void) { DslashPartialCount++;} +void DslashLogPartial(void) { DslashPartialCount++;} void DslashLogDirichlet(void){ DslashDirichletCount++;} -deviceVector StencilBuffer::DeviceCommBuf; void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index cf38db27eb..20237732df 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); // These can move into a params header and be given MacroMagic serialisation struct DefaultImplParams { Coordinate dirichlet; // Blocksize of dirichlet BCs - // int partialDirichlet; + int partialDirichlet; DefaultImplParams() { dirichlet.resize(0); - // partialDirichlet=0; + partialDirichlet=0; }; }; @@ -69,12 +69,6 @@ struct DefaultImplParams { void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); -class StencilBuffer -{ -public: - static deviceVector DeviceCommBuf; // placed in Stencil.cc -}; - void DslashResetCounts(void); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashLogFull(void); @@ -119,8 +113,8 @@ class CartesianStencilAccelerator { /////////////////////////////////////////////////// // If true, this is partially communicated per face /////////////////////////////////////////////////// - // StencilVector _comms_partial_send; - // StencilVector _comms_partial_recv; + StencilVector _comms_partial_send; + StencilVector _comms_partial_recv; // StencilVector _comm_buf_size; StencilVector _permute_type; @@ -211,16 +205,16 @@ class CartesianStencil : public CartesianStencilAccelerator vpointers; Integer buffer_size; Integer type; - // Integer partial; // partial dirichlet BCs + Integer partial; // partial dirichlet BCs Coordinate dims; }; struct Decompress { @@ -237,7 +231,7 @@ class CartesianStencil : public CartesianStencilAccelerator device_heap_size ) { - std::cout << "DeviceBufferMalloc overflow bytes "< > > face_table ; deviceVector surface_list; @@ -403,145 +361,24 @@ class CartesianStencil : public CartesianStencilAcceleratorIsOffNode(packet.from_rank) ) { - - typedef typename getPrecision::real_scalar_type word; - uint64_t words = packet.rbytes/sizeof(word); - const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); - const uint64_t outer = words/nsimd; - - if(sizeof(word)==8) { - - // Can either choose to represent as float vs double and prec change - // OR - // truncate the mantissa bfp16 style - double *dbuf =(double *) packet.recv_buf; - float *fbuf =(float *) packet.compressed_recv_buf; - - accelerator_forNB(ss,outer,nsimd,{ - int lane = acceleratorSIMTlane(nsimd); - dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion - }); - - } else if ( sizeof(word)==4){ - // Can either choose to represent as half vs float and prec change - // OR - // truncate the mantissa bfp16 style - - uint32_t *fbuf =(uint32_t *) packet.recv_buf; - uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf; - - accelerator_forNB(ss,outer,nsimd,{ - int lane = acceleratorSIMTlane(nsimd); - fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes - }); - - } else { - assert(0 && "unknown floating point precision"); - } - } - } - void CompressPacket(Packet &packet) - { - packet.xbytes_compressed = packet.xbytes; - packet.compressed_send_buf = packet.send_buf; - - packet.rbytes_compressed = packet.rbytes; - packet.compressed_recv_buf = packet.recv_buf; - - if ( !SloppyComms ) { - return; - } - - typedef typename getPrecision::real_scalar_type word; - uint64_t words = packet.xbytes/sizeof(word); - const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); - const uint64_t outer = words/nsimd; - - if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) { - - packet.rbytes_compressed = packet.rbytes/2; - packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed); - // std::cout << " CompressPacket recv from "<IsOffNode(packet.to_rank) ) { - - packet.xbytes_compressed = packet.xbytes/2; - packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed); - // std::cout << " CompressPacket send to "<>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent - }); - - } else { - assert(0 && "unknown floating point precision"); - } - - } - // else { - // std::cout << " CompressPacket send is uncompressed to "< > &reqs) { + // std::cout << "Communicate Begin "<Barrier(); FlightRecorder::StepLog("Communicate begin"); - /////////////////////////////////////////////// // All GPU kernel tasks must complete - // accelerator_barrier(); All kernels should ALREADY be complete - //Everyone is here, so noone running slow and still using receive buffer - _grid->StencilBarrier(); - // But the HaloGather had a barrier too. - /////////////////////////////////////////////// - if (SloppyComms) { - DeviceBufferFreeAll(); - } - for(int i=0;iCompressPacket(Packets[i]); - } - if (SloppyComms) { - accelerator_barrier(); -#ifdef NVLINK_GET - _grid->StencilBarrier(); -#endif - } - + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // But the HaloGather had a barrier too. for(int i=0;iBarrier(); _grid->StencilSendToRecvFromPrepare(MpiReqs, - Packets[i].compressed_send_buf, + Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, - Packets[i].compressed_recv_buf, + Packets[i].recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); + Packets[i].xbytes,Packets[i].rbytes,i); } // std::cout << "Communicate PollDtoH "<Barrier(); @@ -552,22 +389,19 @@ class CartesianStencil : public CartesianStencilAcceleratorBarrier(); _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].send_buf,Packets[i].compressed_send_buf, + Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, - Packets[i].recv_buf,Packets[i].compressed_recv_buf, + Packets[i].recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); - // std::cout << "Communicate Begin started "<Barrier(); + Packets[i].xbytes,Packets[i].rbytes,i); } FlightRecorder::StepLog("Communicate begin has finished"); // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iBarrier(); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done - // if ( this->partialDirichlet ) DslashLogPartial(); - if ( this->fullDirichlet ) DslashLogDirichlet(); + if ( this->partialDirichlet ) DslashLogPartial(); + else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // accelerator_barrier(); for(int i=0;iDecompressPacket(Packets[i]); if ( Packets[i].do_recv ) - FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank); + FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); } FlightRecorder::StepLog("Finish communicate complete"); } @@ -785,7 +618,7 @@ class CartesianStencil : public CartesianStencilAccelerator &dv) { Decompress d; - // d.partial = this->partialDirichlet; + d.partial = this->partialDirichlet; d.dims = _grid->_fdimensions; d.kernel_p = k_p; d.mpi_p = m_p; @@ -794,7 +627,7 @@ class CartesianStencil : public CartesianStencilAccelerator &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; - // m.partial = this->partialDirichlet; + m.partial = this->partialDirichlet; m.dims = _grid->_fdimensions; m.type = type; m.mpointer = merge_p; @@ -899,8 +732,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[ii] = comm_dim; this->_comms_recv[ii] = comm_dim; - // this->_comms_partial_send[ii] = 0; - // this->_comms_partial_recv[ii] = 0; + this->_comms_partial_send[ii] = 0; + this->_comms_partial_recv[ii] = 0; if ( block && comm_dim ) { assert(abs(displacement) < ld ); // Quiesce communication across block boundaries @@ -921,10 +754,10 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; } - // if ( partialDirichlet ) { - // this->_comms_partial_send[ii] = !this->_comms_send[ii]; - // this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; - // } + if ( partialDirichlet ) { + this->_comms_partial_send[ii] = !this->_comms_send[ii]; + this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; + } } } } @@ -936,7 +769,6 @@ class CartesianStencil : public CartesianStencilAcceleratorparameters=p; @@ -954,7 +786,7 @@ class CartesianStencil : public CartesianStencilAcceleratorsame_node.resize(npoints); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); - // partialDirichlet = p.partialDirichlet; + partialDirichlet = p.partialDirichlet; DirichletBlock(p.dirichlet); // comms send/recv set up fullDirichlet=0; for(int d=0;dNsimd(); - // Allow for multiple stencils to be communicated simultaneously + // Allow for multiple stencils to exist simultaneously if (!preserve_shm) _grid->ShmBufferFreeAll(); @@ -1103,8 +935,7 @@ class CartesianStencil : public CartesianStencilAcceleratorNsimd(); - // int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; - int comms_recv = this->_comms_recv[point]; + int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1293,8 +1124,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[point]; int comms_recv = this->_comms_recv[point]; - // int comms_partial_send = this->_comms_partial_send[point] ; - // int comms_partial_recv = this->_comms_partial_recv[point] ; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1329,11 +1160,11 @@ class CartesianStencil : public CartesianStencilAccelerator_ostride[dimension]; // base offset for start of plane @@ -1360,8 +1191,7 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p; @@ -1395,8 +1225,7 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p[comm_off], &recv_buf[comm_off], words,Decompressions); @@ -1436,8 +1265,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[point]; int comms_recv = this->_comms_recv[point]; - // int comms_partial_send = this->_comms_partial_send[point] ; - // int comms_partial_recv = this->_comms_partial_recv[point] ; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1512,20 +1341,18 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); } From 87350e03b8ac068e3e693443da1675ef04681060 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 6 Jul 2025 16:33:45 +0000 Subject: [PATCH 29/53] checksum comms --- Grid/communicator/Communicator_base.h | 2 +- Grid/communicator/Communicator_mpi3.cc | 56 +++++++++++++++++++++++- Grid/cshift/Cshift_mpi.h | 59 ++++++++++++++++++++++++-- benchmarks/Benchmark_comms.cc | 3 +- benchmarks/Benchmark_dwf_fp32.cc | 2 +- configure.ac | 11 +++++ 6 files changed, 125 insertions(+), 8 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 62fcfa7b01..1c08762355 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -243,7 +243,7 @@ class CartesianCommunicator : public SharedMemory { Broadcast(root,(void *)&data,sizeof(data)); } -}; +}; NAMESPACE_END(Grid); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 8f3bf9afc3..c4524a4616 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -32,6 +32,7 @@ NAMESPACE_BEGIN(Grid); Grid_MPI_Comm CartesianCommunicator::communicator_world; +extern void * Grid_backtrace_buffer[_NBACKTRACE]; //////////////////////////////////////////// // First initialise of comms system @@ -555,12 +556,19 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(rbytes); ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); CommsRequest_t srq; srq.PacketType = InterNodeRecv; @@ -579,9 +587,17 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(xbytes); + CommsRequest_t srq; +#ifdef GRID_CHECKSUM_COMMS + uint64_t xbytes_data = xbytes - 8; + srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch + assert(xbytes % 8 == 0); + *(uint64_t*)(((char*)host_xmit) + xbytes_data) = svm_xor((uint64_t*)xmit, xbytes_data / 8) ^ 1; // flip one bit so that a zero buffer is not consistent +#else srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch +#endif // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); // assert(ierr==0); @@ -623,7 +639,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vectorHostBufferFreeAll(); // Clean up the buffer allocs diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 05ee946b3a..8119bafc30 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -125,11 +125,20 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); + #ifndef ACCELERATOR_AWARE_MPI +#ifdef GRID_CHECKSUM_COMMS + buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj); +#endif + static hostVector hsend_buf; hsend_buf.resize(buffer_size); static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + +#ifdef GRID_CHECKSUM_COMMS + buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj); #endif - +#endif + int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); RealD tcopy=0.0; @@ -180,12 +189,29 @@ template void Cshift_comms(Lattice &ret,const Lattice &r #else // bouncy bouncy acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); + +#ifdef GRID_CHECKSUM_COMMS + assert(bytes % 8 == 0); + *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)&send_buf[0], bytes / 8) ^ 1; + bytes += 8; +#endif + grid->SendToRecvFrom((void *)&hsend_buf[0], xmit_to_rank, (void *)&hrecv_buf[0], recv_from_rank, bytes); + +#ifdef GRID_CHECKSUM_COMMS + bytes -= 8; acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); + uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes); + uint64_t computed_cs = svm_xor((uint64_t*)&recv_buf[0], bytes / 8) ^ 1; + assert(expected_cs == computed_cs); +#else + acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); +#endif + #endif FlightRecorder::StepLog("Cshift_SendRecv_complete"); @@ -255,9 +281,18 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice hsend_buf; hsend_buf.resize(buffer_size); - hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#ifdef GRID_CHECKSUM_COMMS + buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj); +#endif + + static hostVector hsend_buf; hsend_buf.resize(buffer_size); + static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + +#ifdef GRID_CHECKSUM_COMMS + buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj); +#endif #endif int bytes = buffer_size*sizeof(scalar_object); @@ -320,12 +355,30 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeSendToRecvFrom((void *)&hsend_buf[0], xmit_to_rank, (void *)&hrecv_buf[0], recv_from_rank, bytes); + + +#ifdef GRID_CHECKSUM_COMMS + bytes -= 8; acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); + uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes); + uint64_t computed_cs = svm_xor((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ 1; + assert(expected_cs == computed_cs); +#else + acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); +#endif + #endif xbytes+=bytes; diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 6696c8ebae..5feb97227c 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -219,7 +219,8 @@ int main (int argc, char ** argv) int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } - int tid = omp_get_thread_num(); + //int tid = omp_get_thread_num(); + int tid=0; tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 583def29b0..a722346449 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -262,7 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy) FermionAction::ImplParams p; p.dirichlet=Dirichlet; FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p); - Dw.SloppyComms(sloppy); + //Dw.SloppyComms(sloppy); Dw.ImportGauge(Umu); int ncall =300; diff --git a/configure.ac b/configure.ac index c6a4d187ba..f4fcf20c95 100644 --- a/configure.ac +++ b/configure.ac @@ -263,6 +263,17 @@ case ${ac_ACCELERATOR_AWARE_MPI} in *);; esac +############### CHECKSUM COMMS +AC_ARG_ENABLE([checksum-comms], + [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])], + [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes]) + +case ${ac_CHECKSUM_COMMS} in + yes) + AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);; + *);; +esac + ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], From 912b6cf389194e58e233f28c44edfaea3bf959a9 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 6 Jul 2025 20:34:40 +0000 Subject: [PATCH 30/53] refined checksum --- Grid/communicator/Communicator_mpi3.cc | 10 ++++++++-- Grid/cshift/Cshift_mpi.h | 16 +++++++++++----- Grid/lattice/Lattice_reduction_sycl.h | 20 ++++++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index c4524a4616..c58a3439e9 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -32,7 +32,11 @@ NAMESPACE_BEGIN(Grid); Grid_MPI_Comm CartesianCommunicator::communicator_world; + +#ifdef GRID_CHECKSUM_COMMS extern void * Grid_backtrace_buffer[_NBACKTRACE]; +uint64_t checksum_index = 1; +#endif //////////////////////////////////////////// // First initialise of comms system @@ -558,6 +562,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector #ifndef _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_ +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_CHECKSUM_COMMS +extern uint64_t checksum_index; +#endif -NAMESPACE_BEGIN(Grid); const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { @@ -192,7 +196,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r #ifdef GRID_CHECKSUM_COMMS assert(bytes % 8 == 0); - *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = svm_xor((uint64_t*)&send_buf[0], bytes / 8) ^ 1; + checksum_index++; + *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index); bytes += 8; #endif @@ -206,7 +211,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r bytes -= 8; acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes); - uint64_t computed_cs = svm_xor((uint64_t*)&recv_buf[0], bytes / 8) ^ 1; + uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index); assert(expected_cs == computed_cs); #else acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); @@ -358,7 +363,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const Lattice Word svm_xor(Word *vec,uint64_t L) return ret; } +template Word checksum_gpu(Word *vec,uint64_t L) +{ + Word identity; identity=0; + Word ret = 0; + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); + cgh.parallel_for(sycl::range<1>{L}, + Reduction, + [=] (sycl::id<1> index, auto &sum) { + auto l = index % 61; + sum ^= vec[index]<>(64-l); + }); + }); + } + theGridAccelerator->wait(); + return ret; +} + NAMESPACE_END(Grid); From c0ca7f05db5fa2b37c668472aab78578001c92cd Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 7 Jul 2025 06:02:47 +0000 Subject: [PATCH 31/53] checksum fix --- Grid/communicator/Communicator_mpi3.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index c58a3439e9..2dad9698fb 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -562,7 +562,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferFreeAll(); // Clean up the buffer allocs #ifndef NVLINK_GET From 6307fcb43936e928c80f6cf2d23d0b1102e7b93f Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 9 Jul 2025 09:00:25 +0200 Subject: [PATCH 32/53] view logging --- Grid/lattice/Lattice_view.h | 44 ++++++++++++++++++++++++++++++++++ Grid/stencil/Stencil.h | 6 +++++ Grid/util/FlightRecorder.cc | 48 +++++++++++++++++++++++++++++++++++++ Grid/util/FlightRecorder.h | 18 ++++++++++++++ configure.ac | 11 +++++++++ 5 files changed, 127 insertions(+) diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 064c10e6f5..63065efdad 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -106,6 +106,49 @@ class LatticeView : public LatticeAccelerator } }; + +#ifdef GRID_LOG_VIEWS + +// Little autoscope assister +template +class ViewCloser +{ + View v; // Take a copy of view and call view close when I go out of scope automatically + const char* filename; int line, mode; + public: + ViewCloser(View &_v, const char* _filename, int _line, int _mode) : + v(_v), filename(_filename), line(_line), mode(_mode) { + + switch (mode){ + case AcceleratorRead: + case AcceleratorWrite: + case CpuRead: + case CpuWrite: + ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0])); + break; + } + + }; + ~ViewCloser() { + + switch (mode) { + case AcceleratorWrite: + case CpuWrite: + ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0])); + break; + } + + v.ViewClose(); + } +}; + +#define autoView(l_v,l,mode) \ + auto l_v = l.View(mode); \ + ViewCloser _autoView##l_v(l_v,__FILE__,__LINE__,mode); + +#else + + // Little autoscope assister template class ViewCloser @@ -119,6 +162,7 @@ class ViewCloser #define autoView(l_v,l,mode) \ auto l_v = l.View(mode); \ ViewCloser _autoView##l_v(l_v); +#endif ///////////////////////////////////////////////////////////////////////////////////////// // Lattice expression types used by ET to assemble the AST diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 20237732df..c7c14658af 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -186,6 +186,12 @@ class CartesianStencilView : public CartesianStencilAccelerator ViewLogger::LogVector; + +void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); } +void ViewLogger::End() { Enabled = false; } + +void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes) { + if (!Enabled) + return; + + size_t i = LogVector.size(); + LogVector.resize(i + 1); + auto & n = LogVector[i]; + + n.filename = filename; + n.line = line; + n.index = index; + + if (bytes < sizeof(uint64_t)) { + + n.head = n.tail = 0; + + } else { + + switch (mode) { + case AcceleratorRead: + case AcceleratorWrite: + case AcceleratorWriteDiscard: + acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t)); + acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t)); + break; + + case CpuRead: + case CpuWrite: + //case CpuWriteDiscard: + n.head = *(uint64_t*)data; + n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t)); + break; + } + } +} + +#endif + NAMESPACE_END(Grid); diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h index 7c1cd17191..ee35f22591 100644 --- a/Grid/util/FlightRecorder.h +++ b/Grid/util/FlightRecorder.h @@ -42,5 +42,23 @@ class FlightRecorder { static void xmitLog(void *,uint64_t bytes); static void recvLog(void *,uint64_t bytes,int rank); }; + +#ifdef GRID_LOG_VIEWS +class ViewLogger { + struct Entry_t { + const char* filename; + int line; + int index; + uint64_t head, tail; + }; + +public: + static bool Enabled; + static std::vector LogVector; + static void Begin(); + static void End(); + static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes); +}; +#endif NAMESPACE_END(Grid); diff --git a/configure.ac b/configure.ac index f4fcf20c95..84e8f46ef1 100644 --- a/configure.ac +++ b/configure.ac @@ -274,6 +274,17 @@ case ${ac_CHECKSUM_COMMS} in *);; esac +############### LOG VIEWS +AC_ARG_ENABLE([log-views], + [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])], + [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes]) + +case ${ac_LOG_VIEWS} in + yes) + AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);; + *);; +esac + ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], From 5ad20a90917e893287f5bd0f659dbea6d793cc9e Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 10 Jul 2025 20:15:53 +0000 Subject: [PATCH 33/53] more appropriate view modes --- Grid/cshift/Cshift_common.h | 6 +++--- .../action/fermion/implementation/CayleyFermion5Dcache.h | 8 ++++---- .../fermion/implementation/WilsonKernelsImplementation.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index fdb98cd4e3..b8099c2788 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -202,7 +202,7 @@ template void Scatter_plane_simple (Lattice &rhs,deviceVector< { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); - autoView( rhs_v, rhs, AcceleratorWrite); + autoView( rhs_v, rhs, AcceleratorWriteDiscard); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); }); @@ -228,7 +228,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; - autoView( rhs_v , rhs, AcceleratorWrite); + autoView( rhs_v , rhs, AcceleratorWriteDiscard); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; int b = nn/e1; @@ -302,7 +302,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs { auto table = MapCshiftTable(); autoView(rhs_v , rhs, AcceleratorRead); - autoView(lhs_v , lhs, AcceleratorWrite); + autoView(lhs_v , lhs, AcceleratorWriteDiscard); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 5fbc7612b5..f3e4aa026f 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -52,7 +52,7 @@ CayleyFermion5D::M5D(const FermionField &psi_i, GridBase *grid=psi_i.Grid(); autoView(psi , psi_i,AcceleratorRead); autoView(phi , phi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWrite); + autoView(chi , chi_i,AcceleratorWriteDiscard); assert(phi.Checkerboard() == psi.Checkerboard()); int Ls =this->Ls; @@ -94,7 +94,7 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, GridBase *grid=psi_i.Grid(); autoView(psi , psi_i,AcceleratorRead); autoView(phi , phi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWrite); + autoView(chi , chi_i,AcceleratorWriteDiscard); assert(phi.Checkerboard() == psi.Checkerboard()); int Ls=this->Ls; @@ -130,7 +130,7 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); autoView(psi , psi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWrite); + autoView(chi , chi_i,AcceleratorWriteDiscard); int Ls=this->Ls; @@ -194,7 +194,7 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; autoView(psi , psi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWrite); + autoView(chi , chi_i,AcceleratorWriteDiscard); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 26e80838d2..0908f72fcf 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -500,7 +500,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); - autoView(out_v,out,AcceleratorWrite); + autoView(out_v,out,AcceleratorWriteDiscard); autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { @@ -535,7 +535,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); - autoView(out_v,out,AcceleratorWrite); + autoView(out_v,out,AcceleratorWriteDiscard); autoView(st_v , st,AcceleratorRead); KERNEL_CALL_ID(GenericDhopSite); } @@ -546,7 +546,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v ,U,AcceleratorRead); autoView(in_v ,in,AcceleratorRead); - autoView(out_v,out,AcceleratorWrite); + autoView(out_v,out,AcceleratorWriteDiscard); autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { From 84c7196709db5f1ca156d04e407d44de2725bc26 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 11 Jul 2025 06:59:48 +0000 Subject: [PATCH 34/53] stencil view plays nicely with view logging --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c7c14658af..5be82aa459 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -188,7 +188,7 @@ class CartesianStencilView : public CartesianStencilAccelerator Date: Mon, 14 Jul 2025 18:08:20 +0000 Subject: [PATCH 35/53] towards cleanup of LogViews --- Grid/lattice/Lattice_view.h | 2 ++ .../fermion/implementation/WilsonKernelsImplementation.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 63065efdad..2f6a29e613 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -133,7 +133,9 @@ class ViewCloser switch (mode) { case AcceleratorWrite: + case AcceleratorWriteDiscard: case CpuWrite: + // case CpuWriteDiscard: <- this is a duplicate of CpuWrite (does not make sense to differentiate) ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0])); break; } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 0908f72fcf..26e80838d2 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -500,7 +500,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); - autoView(out_v,out,AcceleratorWriteDiscard); + autoView(out_v,out,AcceleratorWrite); autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { @@ -535,7 +535,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); - autoView(out_v,out,AcceleratorWriteDiscard); + autoView(out_v,out,AcceleratorWrite); autoView(st_v , st,AcceleratorRead); KERNEL_CALL_ID(GenericDhopSite); } @@ -546,7 +546,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v ,U,AcceleratorRead); autoView(in_v ,in,AcceleratorRead); - autoView(out_v,out,AcceleratorWriteDiscard); + autoView(out_v,out,AcceleratorWrite); autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { From d7cdab503d0ca84e5a30b5b829726332a61d3ff7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 22 Jul 2025 05:31:18 +0000 Subject: [PATCH 36/53] double up sycl fences and barriers --- Grid/threads/Accelerator.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2b91165cf4..8b4d51d093 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -346,7 +346,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);}; @@ -355,7 +355,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); theCopyAccelerator->wait(); } /////// @@ -377,9 +377,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { @@ -652,7 +652,7 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);}; ////////////////////////////////////////////// #ifdef GRID_SYCL -inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); }; +inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); theGridAccelerator->ext_oneapi_submit_barrier(); }; #else // Ordering within a stream guaranteed on Nvidia & AMD inline void acceleratorFenceComputeStream(void){ }; From 81cc867748be6cba303eec2e6f0d776afc04dc95 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 23 Jul 2025 08:13:22 +0200 Subject: [PATCH 37/53] SYCL lane --- Grid/threads/Accelerator.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2b91165cf4..71dd2a3ab1 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -317,13 +317,11 @@ extern sycl::queue *theCopyAccelerator; #define accelerator #define accelerator_inline strong_inline -accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT - return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; +#define acceleratorSIMTlane(Nsimd) _lane #else - return 0; +#define acceleratorSIMTlane(Nsimd) 0 #endif -} // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](sycl::handler &cgh) { \ @@ -341,7 +339,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ + auto _lane = item.get_global_id(2); \ { if (iter1 < unum1){ __VA_ARGS__ } }; \ }); \ }); From 8fbeee866191b5e8cd8e2cd1423ef7a60bc736a6 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 23 Jul 2025 06:37:53 +0000 Subject: [PATCH 38/53] cshift test --- Grid/cshift/Cshift_mpi.h | 4 ++++ Grid/threads/Accelerator.h | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 199721b478..86d4305354 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -380,6 +380,10 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice %ld\n", expected_cs, computed_cs); + fflush(stderr); + } assert(expected_cs == computed_cs); #else acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index fc3531ca79..8b4d51d093 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -317,11 +317,13 @@ extern sycl::queue *theCopyAccelerator; #define accelerator #define accelerator_inline strong_inline +accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT -#define acceleratorSIMTlane(Nsimd) _lane + return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; #else -#define acceleratorSIMTlane(Nsimd) 0 + return 0; #endif +} // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](sycl::handler &cgh) { \ @@ -339,7 +341,7 @@ extern sycl::queue *theCopyAccelerator; { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ - auto _lane = item.get_global_id(2); \ + auto lane = item.get_global_id(2); \ { if (iter1 < unum1){ __VA_ARGS__ } }; \ }); \ }); From 08b3844d54d0a86e03d88dcfb47eb661e2ec01fa Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 23 Jul 2025 16:08:19 +0200 Subject: [PATCH 39/53] remove error message since fallback to small-sum path is taken --- Grid/lattice/Lattice_reduction_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 91cb8226bd..3485114c40 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -52,7 +52,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & // let the number of threads in a block be a multiple of 2, starting from warpSize threads = warpSize; if ( threads*sizeofsobj > sharedMemPerBlock ) { - std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; + //std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; return 0; } while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; From d22d07bda5c7f09cdc7798f0d5761ee96cc62868 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 26 Jul 2025 05:20:44 +0000 Subject: [PATCH 40/53] checksum index per Grid to play nice with split grid --- Grid/communicator/Communicator_base.h | 5 +++++ Grid/communicator/Communicator_mpi3.cc | 11 +++++++---- Grid/cshift/Cshift_mpi.h | 8 ++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 1c08762355..a0cb01341c 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -243,6 +243,11 @@ class CartesianCommunicator : public SharedMemory { Broadcast(root,(void *)&data,sizeof(data)); } +#ifdef GRID_CHECKSUM_COMMS + uint64_t checksumIndex; + + uint64_t incrementChecksumIndex() { return checksumIndex++; }; +#endif }; NAMESPACE_END(Grid); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 2dad9698fb..b992f59fb6 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -35,7 +35,6 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world; #ifdef GRID_CHECKSUM_COMMS extern void * Grid_backtrace_buffer[_NBACKTRACE]; -uint64_t checksum_index = 1; #endif //////////////////////////////////////////// @@ -250,6 +249,10 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors MPI_Comm_dup(communicator,&communicator_halo[i]); } assert(Size==_Nprocessors); + +#ifdef GRID_CHECKSUM_COMMS + checksumIndex = 0; +#endif } CartesianCommunicator::~CartesianCommunicator() @@ -599,7 +602,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector NAMESPACE_BEGIN(Grid); -#ifdef GRID_CHECKSUM_COMMS -extern uint64_t checksum_index; -#endif - const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { @@ -196,7 +192,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r #ifdef GRID_CHECKSUM_COMMS assert(bytes % 8 == 0); - checksum_index++; + uint64_t checksum_index = grid->incrementChecksumIndex(); *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index); bytes += 8; #endif @@ -363,7 +359,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeincrementChecksumIndex(); *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index); bytes += 8; #endif From b83023567bb79194f62f8ad84a1bb2f0a7ab1602 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 28 Jul 2025 21:34:18 +0200 Subject: [PATCH 41/53] DeviceMemoryAllocator --- Grid/allocator/Allocator.h | 1 + Grid/allocator/DeviceMemoryAllocator.cc | 114 ++++++++++++++++++++++++ Grid/allocator/DeviceMemoryAllocator.h | 37 ++++++++ Grid/threads/Accelerator.h | 10 +++ configure.ac | 11 +++ 5 files changed, 173 insertions(+) create mode 100644 Grid/allocator/DeviceMemoryAllocator.cc create mode 100644 Grid/allocator/DeviceMemoryAllocator.h diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h index 589ea36f83..f151bc0d54 100644 --- a/Grid/allocator/Allocator.h +++ b/Grid/allocator/Allocator.h @@ -2,3 +2,4 @@ #include #include #include +#include diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc new file mode 100644 index 0000000000..0965207c52 --- /dev/null +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -0,0 +1,114 @@ +#include + +NAMESPACE_BEGIN(Grid); + +#define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024) + + +struct DeviceMemoryAllocator { + + bool initialized; + char* base; + size_t size; + size_t offset; + + DeviceMemoryAllocator() { + initialized = false; + base = 0; + size = 0; + offset = 0; + } + + ~DeviceMemoryAllocator() { + if (initialized) { + acceleratorFreeDevice(base); + initialized = false; + } + } + + std::vector pages; + std::map > size_map; + + void Init(size_t _size) { + assert(!initialized); + + size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + std::cout << GridLogMessage << "Init with " << + size << " bytes" << std::endl; + + base = (char*)acceleratorAllocDeviceInternal(size); + assert(base); + + std::cout << GridLogMessage << "Initialize memory to zero" << std::endl; + + { + uint64_t* ba = (uint64_t*)base; + size_t n = size / sizeof(uint64_t); + accelerator_for(i, n, 1, { + ba[i] = (uint64_t)-1; + }); + } + + std::cout << GridLogMessage << "Done" << std::endl; + + offset = 0; + + pages.resize(n_pages, 0); + + std::cout << GridLogMessage << "Pages initialized" << std::endl; + + initialized = true; + } +}; + +static DeviceMemoryAllocator dma; + +void *acceleratorAllocDevice(size_t bytes) { + if (!dma.initialized) + dma.Init(MemoryManager::DeviceMaxBytes); + + if (!bytes) + bytes++; + + size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + + // first check if + auto & sm = dma.size_map[n_pages]; + if (sm.size()) { + size_t index = sm.back(); + sm.pop_back(); + assert(dma.pages[index] == 0); + dma.pages[index] = n_pages; + + return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + } + + size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + assert(end <= dma.size); + dma.pages[dma.offset] = n_pages; + + void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + dma.offset += n_pages; + + std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate (" + << (dma.size - end) * 100 / dma.size << "% free)" << std::endl; + + return ptr; + +} + +void acceleratorFreeDevice(void *ptr) { + if (!dma.initialized) + return; + + size_t index = ((size_t)((char*)ptr - dma.base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + size_t n_pages = dma.pages[index]; + //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl; + dma.pages[index] = 0; + auto & sm = dma.size_map[n_pages]; + sm.push_back(index); + +} + +NAMESPACE_END(Grid); diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h new file mode 100644 index 0000000000..d3401d7204 --- /dev/null +++ b/Grid/allocator/DeviceMemoryAllocator.h @@ -0,0 +1,37 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/DeviceMemoryAllocator.h + + Copyright (C) 2025 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +void *acceleratorAllocDevice(size_t bytes); +void acceleratorFreeDevice(void *ptr); +#endif + +NAMESPACE_END(Grid); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 8b4d51d093..9d9f765d2b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -44,6 +44,11 @@ Author: paboyle inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); } #endif +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +#define acceleratorAllocDevice acceleratorAllocDeviceInternal +#define acceleratorFreeDevice acceleratorFreeDeviceInternal +#endif + NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////// @@ -722,3 +727,8 @@ template T acceleratorGet(T& dev) NAMESPACE_END(Grid); + +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +#undef acceleratorAllocDevice +#undef acceleratorFreeDevice +#endif diff --git a/configure.ac b/configure.ac index 84e8f46ef1..82954e753a 100644 --- a/configure.ac +++ b/configure.ac @@ -274,6 +274,17 @@ case ${ac_CHECKSUM_COMMS} in *);; esac +############### DEVICE MEMORY ALLOCATOR +AC_ARG_ENABLE([device-memory-allocator], + [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])], + [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=yes]) + +case ${ac_DEVICE_MEMORY_ALLOCATOR} in + yes) + AC_DEFINE([GRID_DEVICE_MEMORY_ALLOCATOR],[1],[device memory allocator]);; + *);; +esac + ############### LOG VIEWS AC_ARG_ENABLE([log-views], [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])], From 928abe5c6df5e6d8375acd66530354963d830ccc Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 28 Jul 2025 21:51:07 +0200 Subject: [PATCH 42/53] DeviceMemoryAllocator --- Grid/allocator/DeviceMemoryAllocator.cc | 3 ++- configure.ac | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc index 0965207c52..9599d8ddfb 100644 --- a/Grid/allocator/DeviceMemoryAllocator.cc +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -4,7 +4,7 @@ NAMESPACE_BEGIN(Grid); #define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024) - +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR struct DeviceMemoryAllocator { bool initialized; @@ -110,5 +110,6 @@ void acceleratorFreeDevice(void *ptr) { sm.push_back(index); } +#endif NAMESPACE_END(Grid); diff --git a/configure.ac b/configure.ac index 82954e753a..20f4c9d520 100644 --- a/configure.ac +++ b/configure.ac @@ -266,7 +266,7 @@ esac ############### CHECKSUM COMMS AC_ARG_ENABLE([checksum-comms], [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])], - [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes]) + [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=no]) case ${ac_CHECKSUM_COMMS} in yes) @@ -277,7 +277,7 @@ esac ############### DEVICE MEMORY ALLOCATOR AC_ARG_ENABLE([device-memory-allocator], [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])], - [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=yes]) + [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no]) case ${ac_DEVICE_MEMORY_ALLOCATOR} in yes) @@ -288,7 +288,7 @@ esac ############### LOG VIEWS AC_ARG_ENABLE([log-views], [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])], - [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes]) + [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=no]) case ${ac_LOG_VIEWS} in yes) From c973aa2cb71319c016d61aeab49fb5bdbce16d1e Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 29 Jul 2025 05:00:52 +0000 Subject: [PATCH 43/53] fix header --- Grid/allocator/DeviceMemoryAllocator.cc | 28 +++++++++++++++++++++++++ Grid/allocator/DeviceMemoryAllocator.h | 3 +-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc index 9599d8ddfb..1a1763861e 100644 --- a/Grid/allocator/DeviceMemoryAllocator.cc +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -1,3 +1,31 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/DeviceMemoryAllocator.h + + Copyright (C) 2025 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + #include NAMESPACE_BEGIN(Grid); diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h index d3401d7204..13c7eb8e18 100644 --- a/Grid/allocator/DeviceMemoryAllocator.h +++ b/Grid/allocator/DeviceMemoryAllocator.h @@ -6,8 +6,7 @@ Copyright (C) 2025 -Author: Peter Boyle -Author: paboyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From 5cdf5baf77b7c3a95020d2657867ae9e2ea3f170 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 29 Jul 2025 06:07:33 +0000 Subject: [PATCH 44/53] fix configure.ac --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 20f4c9d520..d747077a85 100644 --- a/configure.ac +++ b/configure.ac @@ -277,7 +277,7 @@ esac ############### DEVICE MEMORY ALLOCATOR AC_ARG_ENABLE([device-memory-allocator], [AS_HELP_STRING([--enable-device-memory-allocator=yes|no],[device memory allocator])], - [ac_DEVICE_MEMORY_ALLOCATOR=${device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no]) + [ac_DEVICE_MEMORY_ALLOCATOR=${enable_device_memory_allocator}], [ac_DEVICE_MEMORY_ALLOCATOR=no]) case ${ac_DEVICE_MEMORY_ALLOCATOR} in yes) From 7a939edeb6c875f3064d45fdfbe84cdb09b50c4d Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 29 Jul 2025 15:01:50 +0000 Subject: [PATCH 45/53] oversubscription factor --- Grid/allocator/DeviceMemoryAllocator.cc | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc index 1a1763861e..b958cfa1b9 100644 --- a/Grid/allocator/DeviceMemoryAllocator.cc +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -31,6 +31,7 @@ Author: Christoph Lehner NAMESPACE_BEGIN(Grid); #define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024) +#define OVERALLOCATION_FACTOR 1.2 #ifdef GRID_DEVICE_MEMORY_ALLOCATOR struct DeviceMemoryAllocator { @@ -59,6 +60,13 @@ struct DeviceMemoryAllocator { void Init(size_t _size) { assert(!initialized); + + char* str; + if ((str = getenv("GRID_OVERALLOCATION_FACTOR"))) { + _size = (size_t)(_size * atof(str)); + } else { + _size = (size_t)(_size * OVERALLOCATION_FACTOR); + } size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; @@ -73,9 +81,17 @@ struct DeviceMemoryAllocator { { uint64_t* ba = (uint64_t*)base; size_t n = size / sizeof(uint64_t); - accelerator_for(i, n, 1, { - ba[i] = (uint64_t)-1; - }); + size_t MAX_BLOCK_INIT = 128*1024*1024; + while (n > 0) { + size_t n0 = n; + if (n0 > MAX_BLOCK_INIT) + n0 = MAX_BLOCK_INIT; + accelerator_for(i, n0, 1, { + ba[i] = (uint64_t)-1; + }); + ba += n0; + n -= n0; + } } std::cout << GridLogMessage << "Done" << std::endl; @@ -108,6 +124,8 @@ void *acceleratorAllocDevice(size_t bytes) { sm.pop_back(); assert(dma.pages[index] == 0); dma.pages[index] = n_pages; + + std::cout << GridLogMessage << "Can re-use pointer for " << n_pages << " pages" << std::endl; return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; } From 8bd4562a7b1d8a7010ce1420098c94f0751feff1 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 29 Jul 2025 20:38:26 +0200 Subject: [PATCH 46/53] quote reusable pages --- Grid/allocator/DeviceMemoryAllocator.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc index b958cfa1b9..f107aa4cc9 100644 --- a/Grid/allocator/DeviceMemoryAllocator.cc +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -118,27 +118,37 @@ void *acceleratorAllocDevice(size_t bytes) { size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; // first check if - auto & sm = dma.size_map[n_pages]; - if (sm.size()) { - size_t index = sm.back(); - sm.pop_back(); + auto sm = dma.size_map.find(n_pages); + if (sm != dma.size_map.end() && sm->second.size() > 0) { + size_t index = sm->second.back(); + sm->second.pop_back(); + + if (sm->second.size() == 0) + dma.size_map.erase(sm); + assert(dma.pages[index] == 0); dma.pages[index] = n_pages; - std::cout << GridLogMessage << "Can re-use pointer for " << n_pages << " pages" << std::endl; + std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl; return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; } size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + + // TODO: if there are reusable pages, use one that is slightly too large; iterator goes from small to large assert(end <= dma.size); dma.pages[dma.offset] = n_pages; void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; dma.offset += n_pages; + size_t reusable_pages = 0; + for (auto & sm : dma.size_map) + reusable_pages += sm.first * sm.second.size(); + std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate (" - << (dma.size - end) * 100 / dma.size << "% free)" << std::endl; + << (dma.size - end) * 100 / dma.size << "% free, " << reusable_pages << " reusable pages)" << std::endl; return ptr; From bc4f7bdfc4dea17c00c82f33500ad090d8dcdc55 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 30 Jul 2025 08:41:15 +0200 Subject: [PATCH 47/53] cleanup device allocator --- Grid/allocator/DeviceMemoryAllocator.cc | 172 ++++++++++++++++-------- 1 file changed, 118 insertions(+), 54 deletions(-) diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc index f107aa4cc9..c6711deba2 100644 --- a/Grid/allocator/DeviceMemoryAllocator.cc +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -40,12 +40,14 @@ struct DeviceMemoryAllocator { char* base; size_t size; size_t offset; + bool verbose; DeviceMemoryAllocator() { initialized = false; base = 0; size = 0; offset = 0; + verbose = false; } ~DeviceMemoryAllocator() { @@ -67,16 +69,18 @@ struct DeviceMemoryAllocator { } else { _size = (size_t)(_size * OVERALLOCATION_FACTOR); } + + verbose = (getenv("GRID_DEBUG_DEVICE_ALLOCATOR") != 0); size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; - std::cout << GridLogMessage << "Init with " << - size << " bytes" << std::endl; + std::cout << GridLogMessage << "Init device allocator with " << size << " bytes" << std::endl; base = (char*)acceleratorAllocDeviceInternal(size); assert(base); - - std::cout << GridLogMessage << "Initialize memory to zero" << std::endl; + + if (verbose) + std::cout << GridLogMessage << "Initialize memory to zero" << std::endl; { uint64_t* ba = (uint64_t*)base; @@ -93,78 +97,138 @@ struct DeviceMemoryAllocator { n -= n0; } } - - std::cout << GridLogMessage << "Done" << std::endl; + + if (verbose) + std::cout << GridLogMessage << "Done" << std::endl; offset = 0; pages.resize(n_pages, 0); - - std::cout << GridLogMessage << "Pages initialized" << std::endl; + + if (verbose) + std::cout << GridLogMessage << "Pages initialized" << std::endl; initialized = true; } -}; -static DeviceMemoryAllocator dma; + void* attemptReuseExactSize(size_t n_pages) { + auto sm = size_map.find(n_pages); + if (sm != size_map.end() && sm->second.size() > 0) { + size_t index = sm->second.back(); + sm->second.pop_back(); + + if (sm->second.size() == 0) + size_map.erase(sm); + + assert(pages[index] == 0); + pages[index] = n_pages; + + return base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + } + return 0; + } -void *acceleratorAllocDevice(size_t bytes) { - if (!dma.initialized) - dma.Init(MemoryManager::DeviceMaxBytes); - - if (!bytes) - bytes++; - - size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; - - // first check if - auto sm = dma.size_map.find(n_pages); - if (sm != dma.size_map.end() && sm->second.size() > 0) { - size_t index = sm->second.back(); - sm->second.pop_back(); - - if (sm->second.size() == 0) - dma.size_map.erase(sm); + void* attemptAllocUnused(size_t n_pages) { + size_t end = (offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + void* ptr = 0; - assert(dma.pages[index] == 0); - dma.pages[index] = n_pages; - - std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl; + if (end <= size) { + pages[offset] = n_pages; - return dma.base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + ptr = base + offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + offset += n_pages; + + if (verbose) { + size_t reusable_pages = 0; + for (auto & sm : size_map) + reusable_pages += sm.first * sm.second.size(); + + std::cout << GridLogMessage << (size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate (" + << (size - end) * 100 / size << "% unallocated, " << reusable_pages << " reusable pages)" << std::endl; + } + } + + return ptr; } - size_t end = (dma.offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + void* alloc(size_t bytes) { + if (!initialized) + Init(MemoryManager::DeviceMaxBytes); + + if (!bytes) + bytes++; + + size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + + // first check if block of perfect size is available + void* ptr; + if ((ptr = attemptReuseExactSize(n_pages))) { + + if (verbose) + std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl; + + return ptr; + } - // TODO: if there are reusable pages, use one that is slightly too large; iterator goes from small to large - assert(end <= dma.size); - dma.pages[dma.offset] = n_pages; + // if not, attempt to allocate in the unused area + if ((ptr = attemptAllocUnused(n_pages))) + return ptr; + + // last attempt, find a re-usable region that barely fits and return it + // for loop of std::map iterates in ascending order + size_t reusable_pages = 0; + size_t n_pages_usable = 0; + for (auto & sm : size_map) { + assert(sm.second.size() > 0); // should never be empty + reusable_pages += sm.first * sm.second.size(); + if (n_pages_usable == 0 && sm.first > n_pages) + n_pages_usable = sm.first; + } + + if (n_pages_usable == 0) { + std::cout << GridLogMessage << "Out of memory for " << n_pages << " pages! Re-usable pages at time of death:" << std::endl; + + for (auto & sm : size_map) { + std::cout << GridLogMessage << sm.second.size() << " x " << sm.first << " pages" << std::endl; + } + + exit(1); + } - void* ptr = dma.base + dma.offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; - dma.offset += n_pages; + if ((ptr = attemptReuseExactSize(n_pages_usable))) { + + if (verbose) + std::cout << GridLogMessage << "Can re-use pointer for " << n_pages_usable << " pages when " << n_pages << " were needed; " << reusable_pages << " reusable pages" << std::endl; + + return ptr; + } - size_t reusable_pages = 0; - for (auto & sm : dma.size_map) - reusable_pages += sm.first * sm.second.size(); + // this should never be reached + assert(0); + return ptr; + } - std::cout << GridLogMessage << (dma.size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate (" - << (dma.size - end) * 100 / dma.size << "% free, " << reusable_pages << " reusable pages)" << std::endl; + void free(void* ptr) { + if (!initialized) + return; + + size_t index = ((size_t)((char*)ptr - base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + size_t n_pages = pages[index]; + //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl; + pages[index] = 0; + auto & sm = size_map[n_pages]; + sm.push_back(index); + } +}; - return ptr; +static DeviceMemoryAllocator dma; +void *acceleratorAllocDevice(size_t bytes) { + return dma.alloc(bytes); } void acceleratorFreeDevice(void *ptr) { - if (!dma.initialized) - return; - - size_t index = ((size_t)((char*)ptr - dma.base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; - size_t n_pages = dma.pages[index]; - //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl; - dma.pages[index] = 0; - auto & sm = dma.size_map[n_pages]; - sm.push_back(index); - + dma.free(ptr); } #endif From faed6d3e9d8bfa45326a2720b8916228b941b69b Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 12 Dec 2025 14:25:00 +0100 Subject: [PATCH 48/53] M5D virtual --- Grid/qcd/action/fermion/CayleyFermion5D.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index ec80b692b9..6cba3363a8 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -87,14 +87,14 @@ class CayleyFermion5D : public WilsonFermion5D ///////////////////////////////////////////////////// // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// - void M5D(const FermionField &psi, + virtual void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, std::vector &diag, std::vector &upper); - void M5Ddag(const FermionField &psi, + virtual void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, From afd4423bef303fd176bcdb9f3ea89255cba06d53 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 28 Feb 2026 09:23:48 +0000 Subject: [PATCH 49/53] try a different barrier setup at Aurora --- Grid/threads/Accelerator.cc | 4 ++-- Grid/threads/Accelerator.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index aa39f88c93..f750b56c94 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -210,8 +210,8 @@ void acceleratorInit(void) // sycl::gpu_selector selector; // sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); - theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); - // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. + //theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); + theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. #ifdef GRID_SYCL_LEVEL_ZERO_IPC zeInit(0); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 9d9f765d2b..566a815987 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -351,7 +351,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { theGridAccelerator->wait(); theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait_and_throw(); theGridAccelerator->wait_and_throw(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);}; @@ -360,7 +360,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait_and_throw(); theCopyAccelerator->wait_and_throw(); } /////// @@ -370,7 +370,7 @@ typedef sycl::event acceleratorEvent_t; inline void acceleratorEventWait(acceleratorEvent_t ev) { - ev.wait(); + ev.wait_and_throw(); } inline int acceleratorEventIsComplete(acceleratorEvent_t ev) @@ -382,9 +382,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} -inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} inline int acceleratorIsCommunicable(void *ptr) { From a8706981d24024d04cd2b80e54dbf29766137c35 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 9 May 2026 08:11:33 +0000 Subject: [PATCH 50/53] clean compile merge --- Grid/communicator/Communicator_base.h | 12 +- Grid/communicator/Communicator_mpi3.cc | 42 +-- Grid/cshift/Cshift_mpi.h | 1 - Grid/lattice/Lattice_reduction_sycl.h | 20 -- .../action/fermion/ImprovedStaggeredFermion.h | 6 + .../fermion/ImprovedStaggeredFermion5D.h | 6 + .../action/fermion/NaiveStaggeredFermion.h | 6 + Grid/qcd/action/fermion/WilsonCompressor.h | 206 +------------- Grid/qcd/action/fermion/WilsonFermion.h | 6 + Grid/qcd/action/fermion/WilsonFermion5D.h | 9 +- Grid/stencil/Stencil.cc | 9 +- Grid/stencil/Stencil.h | 253 ++++++++++++++---- 12 files changed, 262 insertions(+), 314 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 690b5b23a0..deb93faee8 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -183,6 +183,7 @@ class CartesianCommunicator : public SharedMemory { int recv_from_rank, uint64_t bytes); + int IsOffNode(int rank); double StencilSendToRecvFrom(void *xmit, int xmit_to_rank,int do_xmit, void *recv, @@ -201,9 +202,9 @@ class CartesianCommunicator : public SharedMemory { void StencilSendToRecvFromPollIRecv(std::vector &list); double StencilSendToRecvFromBegin(std::vector &list, - void *xmit, + void *xmit,void *xmit_comp, int xmit_to_rank,int do_xmit, - void *recv, + void *recv,void *recv_comp, int recv_from_rank,int do_recv, uint64_t xbytes,uint64_t rbytes,int dir); @@ -243,12 +244,7 @@ class CartesianCommunicator : public SharedMemory { Broadcast(root,(void *)&data,sizeof(data)); } -#ifdef GRID_CHECKSUM_COMMS - uint64_t checksumIndex; - - uint64_t incrementChecksumIndex() { return checksumIndex++; }; -#endif -}; +}; NAMESPACE_END(Grid); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 59f0dcac8e..3b8561d3ce 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -40,10 +40,6 @@ uint64_t checksum_index = 1; #endif -#ifdef GRID_CHECKSUM_COMMS -extern void * Grid_backtrace_buffer[_NBACKTRACE]; -#endif - //////////////////////////////////////////// // First initialise of comms system //////////////////////////////////////////// @@ -255,11 +251,6 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors for(int i=0;i<_ndimension*2;i++){ MPI_Comm_dup(communicator,&communicator_halo[i]); } - -#ifdef GRID_CHECKSUM_COMMS - checksumIndex = 0; -#endif - GRID_ASSERT(Size==_Nprocessors); } @@ -286,24 +277,24 @@ void CartesianCommunicator::GlobalSum(double &d) } #else void CartesianCommunicator::GlobalSum(float &f){ - FlightRecorder::StepLog("AllReduce"); + FlightRecorder::StepLog("AllReduce float"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalSum(double &d) { - FlightRecorder::StepLog("AllReduce"); + FlightRecorder::StepLog("AllReduce double"); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } #endif void CartesianCommunicator::GlobalSum(uint32_t &u){ - FlightRecorder::StepLog("AllReduce"); + FlightRecorder::StepLog("AllReduce uint32_t"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalSum(uint64_t &u){ - FlightRecorder::StepLog("AllReduce"); + FlightRecorder::StepLog("AllReduce uint64_t"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } @@ -317,26 +308,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){ GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalXOR(uint64_t &u){ + FlightRecorder::StepLog("GlobalXOR"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalMax(float &f) { + FlightRecorder::StepLog("GlobalMax"); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalMax(double &d) { + FlightRecorder::StepLog("GlobalMax"); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalSumVector(float *f,int N) { + FlightRecorder::StepLog("GlobalSumVector(float *)"); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } void CartesianCommunicator::GlobalSumVector(double *d,int N) { + FlightRecorder::StepLog("GlobalSumVector(double *)"); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); GRID_ASSERT(ierr==0); } @@ -410,11 +406,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { std::vector list; double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); - offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } - +int CartesianCommunicator::IsOffNode(int rank) +{ + int grank = ShmRanks[rank]; + if ( grank == MPI_UNDEFINED ) return true; + else return false; +} #ifdef ACCELERATOR_AWARE_MPI void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; @@ -429,9 +430,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, - void *xmit, + void *xmit,void *xmit_comp, int dest,int dox, - void *recv, + void *recv,void *recv_comp, int from,int dor, uint64_t xbytes,uint64_t rbytes,int dir) { @@ -581,7 +582,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(rbytes); ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq); GRID_ASSERT(ierr==0); @@ -603,7 +603,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(xbytes); - CommsRequest_t srq; #ifdef GRID_CHECKSUM_COMMS @@ -714,9 +713,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &list, - void *xmit, + void *xmit,void *xmit_comp, int dest,int dox, - void *recv, + void *recv,void *recv_comp, int from,int dor, uint64_t xbytes,uint64_t rbytes,int dir) { @@ -900,6 +899,7 @@ int CartesianCommunicator::RankWorld(void){ return r; } void CartesianCommunicator::BarrierWorld(void){ + FlightRecorder::StepLog("BarrierWorld"); int ierr = MPI_Barrier(communicator_world); GRID_ASSERT(ierr==0); } diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 5755965e5f..916213f061 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -197,7 +197,6 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj); static hostVector hsend_buf; hsend_buf.resize(buffer_size+pad); static hostVector hrecv_buf; hrecv_buf.resize(buffer_size+pad); -#endif #endif int cb= (cbmask==0x2)? Odd : Even; diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 73ca4cbe5d..c0d1d07810 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -87,26 +87,6 @@ template Word svm_xor(Word *vec,uint64_t L) theGridAccelerator->wait(); return ret; } -template Word checksum_gpu(Word *vec,uint64_t L) -{ - Word identity; identity=0; - Word ret = 0; - { - sycl::buffer abuff(&ret, {1}); - theGridAccelerator->submit([&](sycl::handler &cgh) { - auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); - cgh.parallel_for(sycl::range<1>{L}, - Reduction, - [=] (sycl::id<1> index, auto &sum) { - auto l = index % 61; - sum ^= vec[index]<>(64-l); - }); - }); - } - theGridAccelerator->wait(); - return ret; -} - template Word checksum_gpu(Word *vec,uint64_t L) { Word identity; identity=0; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 1f3ea83338..b202e3011a 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -154,6 +154,12 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 0706e7039c..5ce6d24115 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -179,6 +179,12 @@ class ImprovedStaggeredFermion5D : public StaggeredKernels, public Improv StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 9ec6be9016..53b1826792 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -146,6 +146,12 @@ class NaiveStaggeredFermion : public StaggeredKernels, public NaiveStagger StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index e23d55fc27..c3a5e4cfc3 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -32,209 +32,6 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -/////////////////////////////////////////////////////////////// -// Wilson compressor will need FaceGather policies for: -// Periodic, Dirichlet, and partial Dirichlet for DWF -/////////////////////////////////////////////////////////////// -const int dwf_compressor_depth=2; -#define DWF_COMPRESS -class FaceGatherPartialDWF -{ -public: -#ifdef DWF_COMPRESS - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; -#else - static int PartialCompressionFactor(GridBase *grid) { return 1;} -#endif - template - static void Gather_plane_simple (deviceVector >& table, - const Lattice &rhs, - cobj *buffer, - compressor &compress, - int off,int so,int partial) - { - //DWF only hack: If a direction that is OFF node we use Partial Dirichlet - // Shrinks local and remote comms buffers - GridBase *Grid = rhs.Grid(); - int Ls = Grid->_rdimensions[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth=Ls/2; -#endif - std::pair *table_v = & table[0]; - auto rhs_v = rhs.View(AcceleratorRead); - int vol=table.size()/Ls; - accelerator_forNB( idx,table.size(), vobj::Nsimd(), { - Integer i=idx/Ls; - Integer s=idx%Ls; - Integer sc=depth+s-(Ls-depth); - if(s=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]); - }); - rhs_v.ViewClose(); - } - template - static void DecompressFace(decompressor decompress,Decompression &dd) - { - auto Ls = dd.dims[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth=Ls/2; -#endif - // Just pass in the Grid - auto kp = dd.kernel_p; - auto mp = dd.mpi_p; - int size= dd.buffer_size; - int vol= size/Ls; - accelerator_forNB(o,size,1,{ - int idx=o/Ls; - int s=o%Ls; - if ( s < depth ) { - int oo=s*vol+idx; - kp[o]=mp[oo]; - } else if ( s >= Ls-depth ) { - int sc = depth + s - (Ls-depth); - int oo=sc*vol+idx; - kp[o]=mp[oo]; - } else { - kp[o] = Zero();//fill rest with zero if partial dirichlet - } - }); - } - //////////////////////////////////////////////////////////////////////////////////////////// - // Need to gather *interior portions* for ALL s-slices in simd directions - // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side - // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. - //////////////////////////////////////////////////////////////////////////////////////////// - template - static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, - std::vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type,int partial) - { - GridBase *Grid = rhs.Grid(); - int Ls = Grid->_rdimensions[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth = Ls/2; -#endif - - // insertion of zeroes... - assert( (table.size()&0x1)==0); - int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - - auto rhs_v = rhs.View(AcceleratorRead); - auto p0=&pointers[0][0]; - auto p1=&pointers[1][0]; - auto tp=&table[0]; - int nnum=num/Ls; - accelerator_forNB(j, num, vobj::Nsimd(), { - // Reorders both local and remote comms buffers - // - int s = j % Ls; - int sp1 = (s+depth)%Ls; // peri incremented s slice - - int hxyz= j/Ls; - - int xyz0= hxyz*2; // xyzt part of coor - int xyz1= hxyz*2+1; - - int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... - - int kk0= xyz0*Ls + s ; // s=0 goes to s=1 - int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0 - compress.CompressExchange(p0[jj],p1[jj], - rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites - rhs_v[so+tp[kk1 ].second], - type); - }); - rhs_v.ViewClose(); - } - // Merge routine is for SIMD faces - template - static void MergeFace(decompressor decompress,Merger &mm) - { - auto Ls = mm.dims[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth = Ls/2; -#endif - int num= mm.buffer_size/2; // relate vol and Ls to buffer size - auto mp = &mm.mpointer[0]; - auto vp0= &mm.vpointers[0][0]; // First arg is exchange first - auto vp1= &mm.vpointers[1][0]; - auto type= mm.type; - int nnum = num/Ls; - accelerator_forNB(o,num,Merger::Nsimd,{ - - int s=o%Ls; - int hxyz=o/Ls; // xyzt related component - int xyz0=hxyz*2; - int xyz1=hxyz*2+1; - - int sp = (s+depth)%Ls; - int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... - - int oo0= s+xyz0*Ls; - int oo1= s+xyz1*Ls; - - // same ss0, ss1 pair goes to new layout - decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type); - }); - } -}; -class FaceGatherDWFMixedBCs -{ -public: -#ifdef DWF_COMPRESS - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; -#else - static int PartialCompressionFactor(GridBase *grid) {return 1;} -#endif - - template - static void Gather_plane_simple (deviceVector >& table, - const Lattice &rhs, - cobj *buffer, - compressor &compress, - int off,int so,int partial) - { - // std::cout << " face gather simple DWF partial "< - static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, - std::vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type,int partial) - { - // std::cout << " face gather exch DWF partial "< - static void MergeFace(decompressor decompress,Merger &mm) - { - int partial = mm.partial; - // std::cout << " merge DWF partial "< - static void DecompressFace(decompressor decompress,Decompression &dd) - { - int partial = dd.partial; - // std::cout << " decompress DWF partial "< -class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs -// : public FaceGatherSimple +class WilsonCompressorTemplate : public FaceGatherSimple { public: diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 16320a9330..41a2471e2b 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -165,6 +165,12 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 9dadc866f6..ea0eaa32e0 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -204,7 +204,14 @@ class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic DoubledGaugeField Umu; DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - + + + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Comms buffer // std::vector > comm_buf; diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc index 27dc75eddd..1d04716918 100644 --- a/Grid/stencil/Stencil.cc +++ b/Grid/stencil/Stencil.cc @@ -30,25 +30,26 @@ NAMESPACE_BEGIN(Grid); uint64_t DslashFullCount; -uint64_t DslashPartialCount; +//uint64_t DslashPartialCount; uint64_t DslashDirichletCount; void DslashResetCounts(void) { DslashFullCount=0; - DslashPartialCount=0; + // DslashPartialCount=0; DslashDirichletCount=0; } void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) { dirichlet = DslashDirichletCount; - partial = DslashPartialCount; + partial = 0; full = DslashFullCount; } void DslashLogFull(void) { DslashFullCount++;} -void DslashLogPartial(void) { DslashPartialCount++;} +//void DslashLogPartial(void) { DslashPartialCount++;} void DslashLogDirichlet(void){ DslashDirichletCount++;} +deviceVector StencilBuffer::DeviceCommBuf; void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 881c467377..090b7fe02f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); // These can move into a params header and be given MacroMagic serialisation struct DefaultImplParams { Coordinate dirichlet; // Blocksize of dirichlet BCs - int partialDirichlet; + // int partialDirichlet; DefaultImplParams() { dirichlet.resize(0); - partialDirichlet=0; + // partialDirichlet=0; }; }; @@ -69,6 +69,12 @@ struct DefaultImplParams { void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); +class StencilBuffer +{ +public: + static deviceVector DeviceCommBuf; // placed in Stencil.cc +}; + void DslashResetCounts(void); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashLogFull(void); @@ -113,8 +119,8 @@ class CartesianStencilAccelerator { /////////////////////////////////////////////////// // If true, this is partially communicated per face /////////////////////////////////////////////////// - StencilVector _comms_partial_send; - StencilVector _comms_partial_recv; + // StencilVector _comms_partial_send; + // StencilVector _comms_partial_recv; // StencilVector _comm_buf_size; StencilVector _permute_type; @@ -210,16 +216,16 @@ class CartesianStencil : public CartesianStencilAccelerator vpointers; Integer buffer_size; Integer type; - Integer partial; // partial dirichlet BCs + // Integer partial; // partial dirichlet BCs Coordinate dims; }; struct Decompress { @@ -236,7 +242,7 @@ class CartesianStencil : public CartesianStencilAccelerator > > face_table ; deviceVector surface_list; @@ -396,24 +408,145 @@ class CartesianStencil : public CartesianStencilAcceleratorIsOffNode(packet.from_rank) ) { + + typedef typename getPrecision::real_scalar_type word; + uint64_t words = packet.rbytes/sizeof(word); + const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); + const uint64_t outer = words/nsimd; + + if(sizeof(word)==8) { + + // Can either choose to represent as float vs double and prec change + // OR + // truncate the mantissa bfp16 style + double *dbuf =(double *) packet.recv_buf; + float *fbuf =(float *) packet.compressed_recv_buf; + + accelerator_forNB(ss,outer,nsimd,{ + int lane = acceleratorSIMTlane(nsimd); + dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion + }); + + } else if ( sizeof(word)==4){ + // Can either choose to represent as half vs float and prec change + // OR + // truncate the mantissa bfp16 style + + uint32_t *fbuf =(uint32_t *) packet.recv_buf; + uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf; + + accelerator_forNB(ss,outer,nsimd,{ + int lane = acceleratorSIMTlane(nsimd); + fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes + }); + + } else { + assert(0 && "unknown floating point precision"); + } + } + } + void CompressPacket(Packet &packet) + { + packet.xbytes_compressed = packet.xbytes; + packet.compressed_send_buf = packet.send_buf; + + packet.rbytes_compressed = packet.rbytes; + packet.compressed_recv_buf = packet.recv_buf; + + if ( !SloppyComms ) { + return; + } + + typedef typename getPrecision::real_scalar_type word; + uint64_t words = packet.xbytes/sizeof(word); + const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); + const uint64_t outer = words/nsimd; + + if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) { + + packet.rbytes_compressed = packet.rbytes/2; + packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed); + // std::cout << " CompressPacket recv from "<IsOffNode(packet.to_rank) ) { + + packet.xbytes_compressed = packet.xbytes/2; + packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed); + // std::cout << " CompressPacket send to "<>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent + }); + + } else { + assert(0 && "unknown floating point precision"); + } + + } + // else { + // std::cout << " CompressPacket send is uncompressed to "< > &reqs) { - // std::cout << "Communicate Begin "<Barrier(); FlightRecorder::StepLog("Communicate begin"); + /////////////////////////////////////////////// // All GPU kernel tasks must complete - // accelerator_barrier(); // All kernels should ALREADY be complete - // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer - // But the HaloGather had a barrier too. + // accelerator_barrier(); All kernels should ALREADY be complete + //Everyone is here, so noone running slow and still using receive buffer + _grid->StencilBarrier(); + // But the HaloGather had a barrier too. + /////////////////////////////////////////////// + if (SloppyComms) { + DeviceBufferFreeAll(); + } + for(int i=0;iCompressPacket(Packets[i]); + } + if (SloppyComms) { + accelerator_barrier(); +#ifdef NVLINK_GET + _grid->StencilBarrier(); +#endif + } + for(int i=0;iBarrier(); _grid->StencilSendToRecvFromPrepare(MpiReqs, - Packets[i].send_buf, + Packets[i].compressed_send_buf, Packets[i].to_rank,Packets[i].do_send, - Packets[i].recv_buf, + Packets[i].compressed_recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); + Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); } // std::cout << "Communicate PollDtoH "<Barrier(); @@ -424,19 +557,22 @@ class CartesianStencil : public CartesianStencilAcceleratorBarrier(); _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].send_buf, + Packets[i].send_buf,Packets[i].compressed_send_buf, Packets[i].to_rank,Packets[i].do_send, - Packets[i].recv_buf, + Packets[i].recv_buf,Packets[i].compressed_recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); + Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); + // std::cout << "Communicate Begin started "<Barrier(); } FlightRecorder::StepLog("Communicate begin has finished"); // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iBarrier(); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done - if ( this->partialDirichlet ) DslashLogPartial(); - else if ( this->fullDirichlet ) DslashLogDirichlet(); + // if ( this->partialDirichlet ) DslashLogPartial(); + if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // accelerator_barrier(); for(int i=0;iDecompressPacket(Packets[i]); if ( Packets[i].do_recv ) - FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); + FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank); } FlightRecorder::StepLog("Finish communicate complete"); } @@ -655,7 +792,7 @@ class CartesianStencil : public CartesianStencilAccelerator &dv) { Decompress d; - d.partial = this->partialDirichlet; + // d.partial = this->partialDirichlet; d.dims = _grid->_fdimensions; d.kernel_p = k_p; d.mpi_p = m_p; @@ -664,7 +801,7 @@ class CartesianStencil : public CartesianStencilAccelerator &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; - m.partial = this->partialDirichlet; + // m.partial = this->partialDirichlet; m.dims = _grid->_fdimensions; m.type = type; m.mpointer = merge_p; @@ -769,8 +906,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[ii] = comm_dim; this->_comms_recv[ii] = comm_dim; - this->_comms_partial_send[ii] = 0; - this->_comms_partial_recv[ii] = 0; + // this->_comms_partial_send[ii] = 0; + // this->_comms_partial_recv[ii] = 0; if ( block && comm_dim ) { assert(abs(displacement) < ld ); // Quiesce communication across block boundaries @@ -791,10 +928,10 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; } - if ( partialDirichlet ) { - this->_comms_partial_send[ii] = !this->_comms_send[ii]; - this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; - } + // if ( partialDirichlet ) { + // this->_comms_partial_send[ii] = !this->_comms_send[ii]; + // this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; + // } } } } @@ -806,6 +943,7 @@ class CartesianStencil : public CartesianStencilAcceleratorparameters=p; @@ -823,7 +961,7 @@ class CartesianStencil : public CartesianStencilAcceleratorsame_node.resize(npoints); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); - partialDirichlet = p.partialDirichlet; + // partialDirichlet = p.partialDirichlet; DirichletBlock(p.dirichlet); // comms send/recv set up fullDirichlet=0; for(int d=0;dNsimd(); - // Allow for multiple stencils to exist simultaneously + // Allow for multiple stencils to be communicated simultaneously if (!preserve_shm) _grid->ShmBufferFreeAll(); @@ -972,7 +1110,8 @@ class CartesianStencil : public CartesianStencilAcceleratorNsimd(); - int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; + // int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; + int comms_recv = this->_comms_recv[point]; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1161,8 +1300,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[point]; int comms_recv = this->_comms_recv[point]; - int comms_partial_send = this->_comms_partial_send[point] ; - int comms_partial_recv = this->_comms_partial_recv[point] ; + // int comms_partial_send = this->_comms_partial_send[point] ; + // int comms_partial_recv = this->_comms_partial_recv[point] ; GRID_ASSERT(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1197,11 +1336,11 @@ class CartesianStencil : public CartesianStencilAccelerator_ostride[dimension]; // base offset for start of plane @@ -1228,7 +1367,8 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p; @@ -1262,7 +1402,8 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p[comm_off], &recv_buf[comm_off], words,Decompressions); @@ -1302,8 +1443,8 @@ class CartesianStencil : public CartesianStencilAccelerator_comms_send[point]; int comms_recv = this->_comms_recv[point]; - int comms_partial_send = this->_comms_partial_send[point] ; - int comms_partial_recv = this->_comms_partial_recv[point] ; + // int comms_partial_send = this->_comms_partial_send[point] ; + // int comms_partial_recv = this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1378,18 +1519,20 @@ class CartesianStencil : public CartesianStencilAcceleratoru_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); } From 30cf9a0a7984d610a6081d423d65dec3a8326d3c Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 9 May 2026 10:16:17 +0200 Subject: [PATCH 51/53] Update Benchmark_dwf_fp32.cc --- benchmarks/Benchmark_dwf_fp32.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index c83e129a47..f092751946 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -266,7 +266,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy) FermionAction::ImplParams p; p.dirichlet=Dirichlet; FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p); - //Dw.SloppyComms(sloppy); + Dw.SloppyComms(sloppy); Dw.ImportGauge(Umu); int ncall =300; From 14bff9e1aa9f5ead983e246a8213d21b67564d74 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 9 May 2026 08:19:26 +0000 Subject: [PATCH 52/53] merge adjust --- Grid/communicator/Communicator_none.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 5adbc5d5de..a7232fcdd4 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -127,6 +127,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest dest=0; } +int CartesianCommunicator::IsOffNode(int rank) { return false; } + double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank,int dox, void *recv, From a6ede60b08b266ff3d97d46a753908fc70552305 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 9 May 2026 08:35:01 +0000 Subject: [PATCH 53/53] merge adjust --- Grid/communicator/SharedMemoryMPI.cc | 55 ++++--------------- Grid/cshift/Cshift_mpi.h | 4 +- Grid/lattice/Lattice_reduction_gpu.h | 2 +- Grid/lattice/Lattice_reduction_sycl.h | 10 ++-- Grid/qcd/action/fermion/CayleyFermion5D.h | 4 +- .../implementation/CayleyFermion5Dcache.h | 4 +- Grid/qcd/smearing/GaugeConfiguration.h | 1 - Grid/threads/Accelerator.cc | 4 +- Grid/util/Init.cc | 3 +- HMC/ComputeWilsonFlow.cc | 3 +- configure.ac | 2 +- 11 files changed, 29 insertions(+), 63 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 4af766fd56..213543ccd7 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -540,49 +540,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI // printf("Host buffer allocate for GPU non-aware MPI\n"); -#if 0 - HostCommBuf= acceleratorAllocHost(bytes); -#else HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host -#if 0 - #warning "Moving host buffers to specific NUMA domain" - int numa; - char *numa_name=(char *)getenv("MPI_BUF_NUMA"); - if(numa_name) { - unsigned long page_size = sysconf(_SC_PAGESIZE); - numa = atoi(numa_name); - unsigned long page_count = bytes/page_size; - std::vector pages(page_count); - std::vector nodes(page_count,numa); - std::vector status(page_count,-1); - for(unsigned long p=0;p void Cshift_comms(Lattice &ret,const Lattice &r int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); - #ifndef ACCELERATOR_AWARE_MPI int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj); static hostVector hsend_buf; hsend_buf.resize(buffer_size+pad); static hostVector hrecv_buf; hrecv_buf.resize(buffer_size+pad); #endif - + int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); RealD tcopy=0.0; @@ -351,7 +350,6 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice sharedMemPerBlock ) { - //std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; + std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; return 0; } while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index c0d1d07810..ace7f01086 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -96,11 +96,11 @@ template Word checksum_gpu(Word *vec,uint64_t L) theGridAccelerator->submit([&](sycl::handler &cgh) { auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); cgh.parallel_for(sycl::range<1>{L}, - Reduction, - [=] (sycl::id<1> index, auto &sum) { - auto l = index % 61; - sum ^= vec[index]<>(64-l); - }); + Reduction, + [=] (sycl::id<1> index, auto &sum) { + auto l = index % 61; + sum ^= vec[index]<>(64-l); + }); }); } theGridAccelerator->wait(); diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 6d6c93237e..77491def22 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -87,14 +87,14 @@ class CayleyFermion5D : public WilsonFermion5D ///////////////////////////////////////////////////// // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// - virtual void M5D(const FermionField &psi, + void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, std::vector &diag, std::vector &upper); - virtual void M5Ddag(const FermionField &psi, + void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 8e7e2a603a..4bc703a313 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -130,7 +130,7 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); autoView(psi , psi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWriteDiscard); + autoView(chi , chi_i,AcceleratorWrite); int Ls=this->Ls; @@ -194,7 +194,7 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; autoView(psi , psi_i,AcceleratorRead); - autoView(chi , chi_i,AcceleratorWriteDiscard); + autoView(chi , chi_i,AcceleratorWrite); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index 522541a19d..6f6fd016ef 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -120,7 +120,6 @@ class SmearedConfiguration : public ConfigurationBase } public: /*! @brief Returns smeared configuration at level 'Level' */ -public: const GaugeField &get_smeared_conf(int Level) const { return SmearedSet[Level]; diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index f750b56c94..aa39f88c93 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -210,8 +210,8 @@ void acceleratorInit(void) // sycl::gpu_selector selector; // sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); - //theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); - theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. + theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); + // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. #ifdef GRID_SYCL_LEVEL_ZERO_IPC zeInit(0); diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 2702b89629..23061372a1 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -397,7 +397,7 @@ void Grid_init(int *argc,char ***argv) ///////////////////////////////////////////////////////////////// if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){ Grid_quiesce_nodes(); - } else { + } else { FILE *fp; std::ostringstream fname; @@ -412,7 +412,6 @@ void Grid_init(int *argc,char ***argv) fname << (rank/radix)*radix ; mkdir(fname.str().c_str(), S_IRWXU ); fname << "/"; - fname<<"Grid.stdout."; fname<::avgPlaquette(Uflow); RealD WFlow_TC = WilsonLoops::TopologicalCharge(Uflow); RealD WFlow_TC5Li = WilsonLoops::TopologicalCharge5Li(Uflow); diff --git a/configure.ac b/configure.ac index b06303799d..ce786b0169 100644 --- a/configure.ac +++ b/configure.ac @@ -275,7 +275,7 @@ esac ############### CHECKSUM COMMS AC_ARG_ENABLE([checksum-comms], [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])], - [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes]) + [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=no]) case ${ac_CHECKSUM_COMMS} in yes)