diff --git a/Grid/Grid.h b/Grid/Grid.h index 70859c8154..4bf573b77d 100644 --- a/Grid/Grid.h +++ b/Grid/Grid.h @@ -40,11 +40,11 @@ Author: paboyle #include #include -#include +#include // now contains #include #include #include #include -#include +//#include //moved to ActionCore.h #include #endif diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 90d01f2e46..cdb1aeb755 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -42,12 +42,12 @@ Author: paboyle #include #include #include -#include #include -//#include -#include #include #include +#include +//#include +#include #include #include #include diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 8bd1d11357..0d2249ec25 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -39,6 +39,7 @@ #endif /* HIP save and restore compile environment*/ + #ifdef GRID_HIP #pragma push #pragma push_macro("__HIP_DEVICE_COMPILE__") @@ -63,11 +64,12 @@ #endif /*HIP restore*/ +/* #ifdef __HIP__REDEFINE__ #pragma pop_macro("__HIP_DEVICE_COMPILE__") #pragma pop #endif - +*/ #if defined __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h index 1a5049ca21..c32de62b24 100644 --- a/Grid/cartesian/Cartesian_base.h +++ b/Grid/cartesian/Cartesian_base.h @@ -92,6 +92,7 @@ class GridBase : public CartesianCommunicator , public GridThread { //////////////////////////////////////////////////////////////// virtual int CheckerBoarded(int dim) =0; virtual int CheckerBoard(const Coordinate &site)=0; + virtual int CheckerDim(void){ return 0; }; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index f218c2b56d..3099907a8c 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -60,6 +60,7 @@ class GridRedBlackCartesian : public GridBase // int _checker_dim; std::vector _checker_board; + virtual int CheckerDim(void){ return _checker_dim; }; virtual int isCheckerBoarded(void) const { return 1; }; virtual int CheckerBoarded(int dim){ if( dim==_checker_dim) return 1; diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index fa3f27a56f..19eb5ba7ed 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -52,6 +52,7 @@ inline std::pair *MapCshiftTable(void) template void Gather_plane_simple (const Lattice &rhs,deviceVector &buffer,int dimension,int plane,int cbmask, int off=0) { + GRID_TRACE("Gather_plane_simple"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -105,6 +106,7 @@ Gather_plane_extract(const Lattice &rhs, ExtractPointerArray pointers, int dimension,int plane,int cbmask) { + GRID_TRACE("Gather_plane_extract"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -160,6 +162,7 @@ Gather_plane_extract(const Lattice &rhs, ////////////////////////////////////////////////////// template void Scatter_plane_simple (Lattice &rhs,deviceVector &buffer, int dimension,int plane,int cbmask) { + GRID_TRACE("Scatter_plane_simple"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -214,6 +217,7 @@ template void Scatter_plane_simple (Lattice &rhs,deviceVector< ////////////////////////////////////////////////////// template void Scatter_plane_merge(Lattice &rhs,ExtractPointerArray pointers,int dimension,int plane,int cbmask) { + GRID_TRACE("Scatter_plane_merge"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -263,6 +267,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA template void Copy_plane(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask) { + GRID_TRACE("Copy_plane"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -311,6 +316,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { + GRID_TRACE("Copy_plane_permute"); int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -358,6 +364,7 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice void Cshift_local(Lattice& ret,const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift_local"); int sshift[2]; sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 252478ade2..4688ff25ad 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -38,6 +38,7 @@ extern uint64_t checksum_index; const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift"); typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; @@ -138,6 +139,7 @@ template void Cshift_simple(Lattice& ret,const Lattice & } template void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift_comms"); int sshift[2]; sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); @@ -156,6 +158,7 @@ template void Cshift_comms(Lattice& ret,const Lattice &r template void Cshift_comms_simd(Lattice& ret,const Lattice &rhs,int dimension,int shift) { + GRID_TRACE("Cshift_comms_simd"); int sshift[2]; sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); @@ -173,6 +176,7 @@ template void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { + GRID_TRACE("Cshift_comms_cb"); typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; @@ -305,6 +309,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { + GRID_TRACE("Cshift_comms_simd_cb"); GridBase *grid=rhs.Grid(); const int Nsimd = grid->Nsimd(); typedef typename vobj::vector_type vector_type; diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h index 9a0c6f9647..507a6804d3 100644 --- a/Grid/lattice/Lattice_trace.h +++ b/Grid/lattice/Lattice_trace.h @@ -97,6 +97,7 @@ Lattice > > > Determinant(const Lattice Lattice > > > Inverse(const Lattice > > > &Umu) { +#if 0 GridBase *grid=Umu.Grid(); auto lvol = grid->lSites(); Lattice > > > ret(grid); @@ -121,9 +122,163 @@ Lattice > > > Inverse(const LatticeoSites(); + const int Nsimd=grid->Nsimd(); + Lattice > > > ret(grid); + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,osites,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + + iScalar > > Us; + iScalar > > Ui; + + for(int lane=0;lane +accelerator_inline void LUdcmp( iMatrix &LU, iVector &P) +{ + + const RealD TINY=1.0e-40; + + type1 vv[N], tmp, _max; + for(int i=0; i _max ) _max = tmp; + assert( abs(_max) > TINY ); + vv[i] = abs(1.0/_max); + } + + int imax; + for(int k=0; k_max) { + _max = tmp; + imax = i; + } + } + if(k!=imax){ + for(int j=0; j +accelerator_inline void solve( iVector &x, const iMatrix LU, const iVector P){ + + type1 sum = 0.0; + + int ii=0; + for(int i=0; i0.0) ii=i+1; + x(i) = sum; + } + for(int i=N-1; i>=0; i--){ + sum = x(i); + for(int j=i+1; j +Lattice > > > Inverse_RealPart(const Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto osites = grid->oSites(); + const int Nsimd=grid->Nsimd(); + Lattice > > > ret(grid); +#if 0 // CPU version + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,osites,{ + Eigen::MatrixXd EigenU = Eigen::MatrixXd::Zero(N,N); + + iScalar > > Us; + iScalar > > Ui; + + for(int lane=0;laneoSites(),vComplex::Nsimd(),{ + iMatrix LU; + iVector P; + iVector e; + // scalar layout won't coalesce +#ifdef GRID_SIMT + { + int blane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int blane=0;blane_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); } } - //////////////////////////////////////////////////////////////////////////////////////////// // remove and insert a half checkerboard //////////////////////////////////////////////////////////////////////////////////////////// + template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full) { - half.Checkerboard() = cb; - - autoView( half_v, half, CpuWrite); - autoView( full_v, full, CpuRead); - thread_for(ss, full.Grid()->oSites(),{ - int cbos; - Coordinate coor; - full.Grid()->oCoorFromOindex(coor,ss); - cbos=half.Grid()->CheckerBoard(coor); - - if (cbos==cb) { - int ssh=half.Grid()->oIndex(coor); - half_v[ssh] = full_v[ss]; - } - }); + acceleratorPickCheckerboard(cb,half,full); } template inline void setCheckerboard(Lattice &full,const Lattice &half) { - int cb = half.Checkerboard(); - autoView( half_v , half, CpuRead); - autoView( full_v , full, CpuWrite); - thread_for(ss,full.Grid()->oSites(),{ - - Coordinate coor; - int cbos; - - full.Grid()->oCoorFromOindex(coor,ss); - cbos=half.Grid()->CheckerBoard(coor); - - if (cbos==cb) { - int ssh=half.Grid()->oIndex(coor); - full_v[ss]=half_v[ssh]; - } - }); + acceleratorSetCheckerboard(full,half); } -template inline void acceleratorPickCheckerboard(int cb,Lattice &half,const Lattice &full, int checker_dim_half=0) +template inline void acceleratorPickCheckerboard(int cb,Lattice &half,const Lattice &full, int dummy=0) { half.Checkerboard() = cb; autoView(half_v, half, AcceleratorWrite); @@ -95,6 +66,7 @@ template inline void acceleratorPickCheckerboard(int cb,Lattice_ndimension; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate ostride_half = half.Grid()->_ostride; + int checker_dim_half = half.Grid()->CheckerDim(); accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ Coordinate coor; @@ -119,7 +91,7 @@ template inline void acceleratorPickCheckerboard(int cb,Lattice inline void acceleratorSetCheckerboard(Lattice &full,const Lattice &half, int checker_dim_half=0) +template inline void acceleratorSetCheckerboard(Lattice &full,const Lattice &half, int dummy=0) { int cb = half.Checkerboard(); autoView(half_v , half, AcceleratorRead); @@ -129,6 +101,7 @@ template inline void acceleratorSetCheckerboard(Lattice &full, unsigned long ndim_half = half.Grid()->_ndimension; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate ostride_half = half.Grid()->_ostride; + int checker_dim_half = half.Grid()->CheckerDim(); accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ Coordinate coor; diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 4affe542b6..98df3a104c 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -144,7 +144,7 @@ class ViewCloser #define autoView(l_v,l,mode) \ auto l_v = l.View(mode); \ - ViewCloser _autoView##l_v(l_v,__FILE__,__LINE__,mode); + ViewCloser _autoView##l_v(l_v,(const char *)__FILE__,(int)__LINE__,(int)mode); #else // Little autoscope assister diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 23f6c13af6..43e75c6ab6 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -298,6 +298,7 @@ class PaddedCell { if( processors[d]==1 ) fll[d]=0; } localCopyRegion(in,out,fll,tll,local); + //DumpSliceNorm("PaddedCell Extract ",out); return out; } template @@ -309,6 +310,7 @@ class PaddedCell { for(int d=0;d @@ -320,6 +322,7 @@ class PaddedCell { for(int d=0;d @@ -436,8 +442,8 @@ class PaddedCell { RealD t_comms=0.0; RealD t_copy=0.0; - // std::cout << GridLogMessage << "dimension " < scalardata(lsites); std::vector iodata(lsites); // Munge, checksum, byte order in here + FlightRecorder::StepLog("readLatticeObject BEGIN"); + grid->Barrier(); IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control, nersc_csum,scidac_csuma,scidac_csumb); + FlightRecorder::StepLog("readLatticeObject END"); GridStopWatch timer; timer.Start(); @@ -611,9 +614,12 @@ class BinaryIO { timer.Stop(); while (attemptsLeft >= 0) { + FlightRecorder::StepLog("writeLatticeObject BEGIN"); grid->Barrier(); IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control, nersc_csum,scidac_csuma,scidac_csumb); + FlightRecorder::StepLog("writeLatticeObject END"); + grid->Barrier(); if (checkWrite) { std::vector ckiodata(lsites); diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index 8ed7df5b57..7b916ac16b 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -24,7 +24,8 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#pragma once +#ifndef GRID_ILDG_IO_H +#define GRID_ILDG_IO_H #ifdef HAVE_LIME #include @@ -951,4 +952,4 @@ NAMESPACE_END(Grid); //HAVE_LIME #endif - +#endif diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 1fc3fe31d5..1e38cd4879 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -27,6 +27,9 @@ *************************************************************************************/ /* END LEGAL */ +#ifndef GRID_METADATA_IO_H +#define GRID_METADATA_IO_H + #include #include #include @@ -36,6 +39,7 @@ #include #include + NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////// @@ -343,3 +347,4 @@ struct Gauge3x2unmunger{ NAMESPACE_END(Grid); +#endif diff --git a/Grid/qcd/action/Action.h b/Grid/qcd/action/Action.h index 737c1ff02b..8b8da5b21d 100644 --- a/Grid/qcd/action/Action.h +++ b/Grid/qcd/action/Action.h @@ -33,6 +33,7 @@ Author: paboyle #ifndef GRID_QCD_ACTION_H #define GRID_QCD_ACTION_H + //////////////////////////////////////////// // Abstract base interface //////////////////////////////////////////// diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 96afea0a5a..a765a9d1a9 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -32,8 +32,67 @@ directory #ifndef ACTION_BASE_H #define ACTION_BASE_H +//#define TIMom +#define PRINT_SNAPSHOTS +#ifdef PRINT_SNAPSHOTS +#include +#include +#endif + + NAMESPACE_BEGIN(Grid); +#ifdef PRINT_SNAPSHOTS +template LatticeComplex LocalFieldSquareNorm(Lattice& F){ + LatticeComplex Hloc(F.Grid()), tmp(F.Grid()); + LatticeColourMatrix Fmu(F.Grid()); + Hloc = Zero(); + for (int mu = 0; mu < Nd; mu++) { + Fmu = PeekIndex(F, mu); + tmp = (-1.0/HMC_MOMENTUM_DENOMINATOR)*trace(Fmu * Fmu); + Hloc = Hloc + tmp; + } + return Hloc; +} +template void writeField(Lattice &F, std::string filename, bool reduce=true, bool alg=true) { + + std::replace(filename.begin(),filename.end(), '/', '_'); + std::replace(filename.begin(),filename.end(), '(', '_'); + std::replace(filename.begin(),filename.end(), ')', ' '); // wanted to eliminate ')' from file name + filename.erase(std::remove_if(filename.begin(), filename.end(), [](unsigned char x) { return std::isspace(x); }), filename.end()); + +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(F.Grid()->IsBoss()); + WR.open(filename); + if (reduce) { + auto F_reduced=alg? LocalFieldSquareNorm(F) : localNorm2(F); + WR.writeScidacFieldRecord(F_reduced,record,0); + } + else + WR.writeScidacFieldRecord(F,record,0); + WR.close(); +#else + assert( false && "writeField function needs c-lime support!"); +#endif +} + +// vobj = vLorentzColourMatrixD +template void writeConfig(Lattice &F, std::string filename){ + + std::replace(filename.begin(),filename.end(), '/', '_'); + std::replace(filename.begin(),filename.end(), '(', '_'); + std::replace(filename.begin(),filename.end(), ')', ' '); // wanted to eliminate ')' from file name + filename.erase(std::remove_if(filename.begin(), filename.end(), [](unsigned char x) { return std::isspace(x); }), filename.end()); + + // Use NERSC IO for consistency with our HMC runs + // Then, GaugeStats = PeriodicGaugeStatistics <- the default + int precision32 = 1; + int tworow = 0; + NerscIO::writeConfiguration(F, filename, tworow, precision32); +} +#endif + /////////////////////////////////// // Smart configuration base class /////////////////////////////////// @@ -53,7 +112,7 @@ template class Action { public: - bool is_smeared = false; + bool is_smeared = false; RealD deriv_norm_sum; RealD deriv_max_sum; RealD Fdt_norm_sum; @@ -116,9 +175,17 @@ class Action } virtual void deriv(ConfigurationBase& U, GaugeField& dSdU) { - deriv(U.get_U(is_smeared),dSdU); + // PRINT_SNAPSHOTS: For some actions, deriv is overridden => could be better if we put writeField in update_P + deriv(U.get_U(is_smeared),dSdU); +#ifdef PRINT_SNAPSHOTS + writeField(dSdU, "F_"+action_name()+"_lat."+std::to_string(deriv_num)); +#endif + if ( is_smeared ) { U.smeared_force(dSdU); +#ifdef PRINT_SNAPSHOTS + writeField(dSdU, "F_"+action_name()+"_smr."+std::to_string(deriv_num)); +#endif } } /////////////////////////////// diff --git a/Grid/qcd/action/ActionCore.h b/Grid/qcd/action/ActionCore.h index db916b0217..6c5f214741 100644 --- a/Grid/qcd/action/ActionCore.h +++ b/Grid/qcd/action/ActionCore.h @@ -30,7 +30,24 @@ directory #ifndef QCD_ACTION_CORE #define QCD_ACTION_CORE -#include + +#include + +//////////////////////////////////////////// +// I/O Interfaces for Action and HMC +//////////////////////////////////////////// +#include // also included from Grid/qcd/action/gauge/Gauge.h +#include // moved from Grid.h +// moved from HMC_aggregate.h to allow IO op's in action +// Action.h preceeds HMC_aggregate.h in Grid.h => safe to move here from HMC_aggregate.h +#include +#include +#include +#include +#if !defined(GRID_COMMS_NONE) +#include +#endif +NAMESPACE_CHECK(Ildg); #include NAMESPACE_CHECK(ActionBase); diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 95b692905f..3137952d0f 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -125,7 +125,22 @@ template PokeIndex(P, Pmu, mu); } } - + + //#ifdef TIMom + static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG, LatticeComplex W) + { + LinkField Pmu(P.Grid()); + Pmu = Zero(); + + for (int mu = 0; mu < Nd; mu++) { + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu); + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ; + Pmu = scale*W*Pmu; + PokeIndex(P, Pmu, mu); + } + } + //#endif + static inline Field projectForce(Field &P) { Field ret(P.Grid()); Group::taProj(P, ret); @@ -151,7 +166,7 @@ template // diff += end - start; // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; } - + static inline RealD FieldSquareNorm(Field& U){ LatticeComplex Hloc(U.Grid()); Hloc = Zero(); diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index c56ae6ad8c..1087dfd2fe 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -263,10 +263,14 @@ class HybridMonteCarlo { if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) { std::cout << GridLogHMC << "-- Thermalization" << std::endl; } + +#ifndef PRINT_SNAPSHOTS + Grid_heartbeat(); +#endif double t0=usecond(); Ucopy = Ucur; - + DeltaH = evolve_hmc_step(Ucopy); // Metropolis-Hastings test bool accept = true; @@ -282,6 +286,10 @@ class HybridMonteCarlo { double t1=usecond(); std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl; +#ifndef PRINT_SNAPSHOTS + Grid_heartbeat_off(); +#endif + TheIntegrator.print_timer(); TheIntegrator.Smearer.set_Field(Ucur); diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h index cb5109532b..f8edf9dc0c 100644 --- a/Grid/qcd/hmc/HMC_aggregate.h +++ b/Grid/qcd/hmc/HMC_aggregate.h @@ -35,16 +35,6 @@ directory #include #include -// annoying location; should move this ? -#include -#include -#include -#include -#if !defined(GRID_COMMS_NONE) -#include -#endif -NAMESPACE_CHECK(Ildg); - #include #include #include diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index f497f7498f..b0b8b3bf40 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -27,8 +27,9 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ - /* END LEGAL */ - //-------------------------------------------------------------------- + /* END LEGAL */ +//-------------------------------------------------------------------- + #ifndef INTEGRATOR_INCLUDED #define INTEGRATOR_INCLUDED @@ -41,6 +42,8 @@ class IntegratorParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(IntegratorParameters, std::string, name, // name of the integrator unsigned int, MDsteps, // number of outer steps + RealD, alpha, + int, box_dim, RealD, trajL) // trajectory length IntegratorParameters(int MDsteps_ = 10, RealD trajL_ = 1.0) @@ -62,6 +65,7 @@ class IntegratorParameters: Serializable { } }; + /*! @brief Class for Molecular Dynamics management */ template class Integrator { @@ -94,11 +98,30 @@ class Integrator { static MomentumFilterNone filter; return &filter; } +#ifdef TIMom + + // TODO: make mask as a member; put most of the code to the constructor + static void scaleMom(LatticeComplex& mask, RealD alpha, int box_dim ){ + LatticeInteger coor(mask.Grid()),tmp(mask.Grid()); + LatticeComplex ones(mask.Grid()); + mask = Zero(); ones = ComplexD(1.0,0.0); tmp = Zero(); + for(int i=0; iapplyFilter(MomFiltered); - +#ifdef TIMom + LatticeComplex W(U.Grid()); W = Zero(); + scaleMom(W,Params.alpha*Params.alpha, Params.box_dim); + for (int mu = 0; mu < Nd; mu++) { + typename FieldImplementation::LinkField Pmu(W.Grid()); + Pmu = PeekIndex(MomFiltered, mu); + Pmu = W*Pmu; + PokeIndex(MomFiltered, Pmu, mu); + } +#endif // exponential of Mom*U in the gauge fields case FieldImplementation::update_field(MomFiltered, U, ep); - +#ifdef PRINT_SNAPSHOTS + writeConfig(U,"U_lat_"+std::to_string(t_U+ep)); +#endif // Update the smeared fields, can be implemented as observer Smearer.set_Field(U); // Update the higher representations fields Representations.update(U); // void functions if fundamental representation +#ifdef PRINT_SNAPSHOTS + writeConfig(Smearer.get_SmearedU(),"U_smr_"+std::to_string(t_U+ep)); +#endif } virtual void step(Field& U, int level, int first, int last) = 0; @@ -402,8 +439,19 @@ class Integrator { std::cout << GridLogIntegrator << "Integrator refresh" << std::endl; std::cout << GridLogIntegrator << "Generating momentum" << std::endl; +#ifndef TIMom FieldImplementation::generate_momenta(P, sRNG, pRNG); - +#else + std::cout << GridLogIntegrator << "Scale momentum" << std::endl; + + LatticeComplex W(U.Grid()); W = Zero(); + //////////////////// + // Setup the mask + //////////////////// + scaleMom(W,1/Params.alpha, Params.box_dim); + FieldImplementation::generate_momenta(P, sRNG, pRNG, W); +#endif + // Update the smeared fields, can be implemented as observer // necessary to keep the fields updated even after a reject // of the Metropolis @@ -459,7 +507,10 @@ class Integrator { std::cout << GridLogIntegrator << "Integrator action\n"; RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom - +#ifdef PRINT_SNAPSHOTS + std::cout << GridLogMessage << "Final Half Mom Squared: " << H << std::endl; +#endif + RealD Hterm; // Actions @@ -505,7 +556,10 @@ class Integrator { std::cout << GridLogIntegrator << "Integrator initial action\n"; RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom - +#ifdef PRINT_SNAPSHOTS + std::cout << GridLogMessage << "Initial Half Mom Squared: " << H << std::endl; +#endif + RealD Hterm; // Actions diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h index 73243be207..0d79c1775f 100644 --- a/Grid/qcd/smearing/GaugeConfigurationMasked.h +++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h @@ -4,18 +4,23 @@ @brief Declares the GaugeConfiguration class */ #pragma once +#ifndef GAUGE_CONFIG_MASK_H NAMESPACE_BEGIN(Grid); - +//#define DEBUG template void Dump(const Lattice & lat, std::string s, Coordinate site = Coordinate({0,0,0,0})) { typename T::scalar_object tmp; + if (lat.Checkerboard() != lat.Grid()->CheckerBoard(site)) + site = Coordinate({0,0,0,1}); peekSite(tmp,lat,site); std::cout << " Dump "< { public: INHERIT_GIMPL_TYPES(Gimpl); - + private: // These live in base class // const unsigned int smearingLevels; // Smear_Stout *StoutSmearing; // std::vector SmearedSet; - - std::vector masks; + GridCartesian * UGrid; // keep a copy of the grid + GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object + PaddedCell Ghost; + + std::vector masks; + std::vector cbs; + std::vector gStencils; + std::vector gStencils_smear; + typedef typename SU3Adjoint::AMatrix AdjMatrix; typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField; typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField; - + typedef typename SU3::vAlgebraMatrix vAlgebraMatrix; + + void BaseSmearDerivative(GaugeField& SigmaTerm, const GaugeField& iLambda, const GaugeField& U, int mmu, RealD rho) - { + {GRID_TRACE("BaseSmearDerivative"); // Reference // Morningstar, Peardon, Phys.Rev.D69,054501(2004) // Equation 75 @@ -58,6 +72,9 @@ class SmearedConfigurationMasked : public SmearedConfiguration GaugeLinkField sh_field(grid), temp_Sigma(grid); Real rho_munu, rho_numu; + RealD t = 0; + t-=usecond(); + rho_munu = rho; rho_numu = rho; for(int mu = 0; mu < Nd; ++mu){ @@ -100,65 +117,975 @@ class SmearedConfigurationMasked : public SmearedConfiguration // staple = Zero(); sh_field = Cshift(U_nu, mu, 1); - temp_Sigma = Zero(); + temp_Sigma = Zero(); + + if ( mu == mmu ) + temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; + + if ( nu == mmu ) { + temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; + + u_tmp = adj(U_nu)*iLambda_nu; + sh_field = Cshift(u_tmp, mu, 1); + temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; + } + + sh_field = Cshift(temp_Sigma, nu, -1); + Gimpl::AddLink(SigmaTerm, sh_field, mu); + + } + } + + t+=usecond(); + std::cout << GridLogMessage << " BaseSmearDerivative " << t/1e3 << " ms " << std::endl; + + } +#ifdef DEBUG + //for debugging + void BaseSmear_cb(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) { + GRID_TRACE("BaseSmear_cb"); + GridBase *grid = U.Grid(); + GridBase *hgrid = Cup.Grid(); + GaugeLinkField tmp_stpl(grid); + GaugeLinkField tmp_stpl_eo(hgrid); + WilsonLoops WL; + int cb = Cup.Checkerboard(); + RealD t = 0; + + t-=usecond(); + Cup = Zero(); + for(int nu=0; nu WL; + RealD t = 0; + + t-=usecond(); + Cup = Zero(); + for(int nu=0; nuNsimd()<oSites(), ggrid->Nsimd(), { + typedef decltype(coalescedRead(tmp_stpl_v[0])) LinkMat; + + LinkMat tmp = Zero(); + for(int nu=0; nu_offset], e->_permute, Nd)(nu)(); + e = gStencil_v.GetEntry(1+inc,ss); + auto U_mu_xpnu = coalescedReadGeneralPermute(gU_v[e->_offset], e->_permute, Nd)(mu)(); + e = gStencil_v.GetEntry(2+inc,ss); + auto Udag_nu_xpmu = adj(coalescedReadGeneralPermute(gU_v[e->_offset], e->_permute, Nd))(nu)(); + + tmp()() = tmp()() + U_nu_x * U_mu_xpnu * Udag_nu_xpmu; + + e = gStencil_v.GetEntry(3+inc,ss); + auto Udag_nu_xmnu = adj(coalescedReadGeneralPermute(gU_v[e->_offset], e->_permute, Nd))(nu)(); + e = gStencil_v.GetEntry(4+inc,ss); + auto U_mu_xmnu = coalescedReadGeneralPermute(gU_v[e->_offset], e->_permute, Nd)(mu)(); + e = gStencil_v.GetEntry(5+inc,ss); + auto U_nu_xpmu_mnu = coalescedReadGeneralPermute(gU_v[e->_offset], e->_permute, Nd)(nu)(); + + tmp()() = tmp()() + Udag_nu_xmnu * U_mu_xmnu * U_nu_xpmu_mnu; + + } + } + coalescedWrite(tmp_stpl_v[ss],rho*tmp); + }); + pickCheckerboard(cb,Cup,Ghost.Extract(tmp_stpl)); + + t+=usecond(); + std::cout << GridLogMessage << " BaseSmear_ghost " << t/1e3 << " ms " << std::endl; + } + + // Adjoint vector to GaugeField force + void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu) + { + Complex ci(0,1); + GaugeLinkField Fdet_pol(Fdet.Grid()); + RealD t = 0; + + t-=usecond(); + Fdet_pol=Zero(); + for(int e=0;e<8;e++){ + ColourMatrix te; + SU3::generator(e, te); + auto tmp=peekColour(Fdet_nu,e); + Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why? + } + pokeLorentz(Fdet, Fdet_pol, nu); + + t+=usecond(); + std::cout << GridLogPerformance << " InsertForce " << t/1e3 << " ms " << std::endl; + } + + void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) + { + GRID_TRACE("Compute_MpInvJx_dNxxdSy"); + int cb = PlaqL.Checkerboard(); + GridBase *grid = PlaqL.Grid(); + const int Ngen = SU3Adjoint::Dimension; + Complex ci(0,1); + RealD t; + + t=-usecond(); +#if 0 // Due to Aurora Issue + autoView(Fdet2_v,Fdet2,AcceleratorWrite); + autoView(PlaqL_v,PlaqL,AcceleratorRead); + autoView(PlaqR_v,PlaqR,AcceleratorRead); + autoView(MpInvJx_v,MpInvJx,AcceleratorRead); + const int nsimd = vAlgebraMatrix::Nsimd(); + accelerator_for(ss,grid->oSites(),nsimd,{ + typedef decltype(coalescedRead(MpInvJx_v[0])) adj_mat; + typedef decltype(coalescedRead(Fdet2_v[0])) adj_vec; + + adj_mat Dbc; + adj_vec Fdet; + ColourMatrix ta; + + for(int a=0;aoSites(),a,Ngen,nsimd,{ + typedef decltype(coalescedRead(MpInvJx_v[0])) adj_mat; + typedef decltype(coalescedRead(Fdet2_v[0])) adj_vec; + + adj_mat Dbc; + ColourMatrix ta; + + // Qlat Tb = 2i Tb^Grid + SU3::generator(a, ta); + ta = 2.0 * ci * ta; + auto UtaU = adj(PlaqL_v(ss))*ta*PlaqR_v(ss); + SU3::LieAlgebraProject(Dbc,UtaU); + + coalescedWrite(Fdet2_v[ss]()()(a),traceProduct(MpInvJx_v(ss),Dbc)()()()); + }); + +#endif + t+=usecond(); + //std::cout << GridLogMessage << "DEBUG: Compute_MpInvJx_dNxxdSy " << nsimd<<" " <oSites(),nsimd,{ + typedef decltype(coalescedRead(NxAd_v[0])) adj_mat; + adj_mat NxAd_site; + SU3::LieAlgebraProject(NxAd_site,PlaqL_v(ss),PlaqR_v(ss)); + coalescedWrite(NxAd_v[ss],NxAd_site); + }); + t+=usecond(); + // std::cout << GridLogMessage << "DEBUG: CNxy" << nsimd<(U,mu); + tmp=PeekIndex(masks[smr],mu); + Umu=Umu*tmp; + PokeIndex(U, Umu, mu); + } + } +public: + + void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr) + { + GRID_TRACE("logDetJacobianForceLevel"); + GridBase* grid = U.Grid(); + GridBase* hgrid = UrbGrid; // For now, assume masking is based on red-black checkerboarding + assert(grid==UGrid); + //GaugeField C(grid); + GaugeField Umsk(grid); + std::vector Umu(Nd,grid); + GaugeLinkField Cmu(hgrid); // U and staple; C contains factor of epsilon + GaugeLinkField Zx(hgrid); // U times Staple, contains factor of epsilon + GaugeLinkField Utmp(grid); + GaugeLinkField Ueo(hgrid); + GaugeLinkField PlaqL(hgrid); + GaugeLinkField PlaqR(hgrid); + const int Ngen = SU3Adjoint::Dimension; + ColourMatrix Ident; + const int Nsimd = vAlgebraMatrix::Nsimd(); + + AdjVectorField dJdXe_nMpInv(hgrid); + AdjVectorField dJdXe_nMpInv_y(hgrid); + AdjMatrixField MpAd(hgrid); // Mprime luchang's notes + AdjMatrixField MpAdInv(hgrid); // Mprime inverse + AdjMatrixField NxxAd(hgrid); // Nxx in adjoint space + AdjMatrixField JxAd(hgrid); + AdjMatrixField ZxAd(hgrid); + + RealD t0 = usecond(); + Ident = ComplexD(1.0); + for(int d=0;d(masks[smr],mu); // the cb mask + + Umsk = U; + ApplyMask(Umsk,smr); + Utmp = peekLorentz(Umsk,mu); + pickCheckerboard(cb,Ueo,Utmp); + + GaugeField gU(grid); + std::vector gUmu(Nd,grid); + { + GRID_TRACE("ExchangePeriodic"); + gU = Ghost.ExchangePeriodic(U); + for(int d=0; dStoutSmearing->SmearRho[1]; + int idx=0; + for(int mu=0;mu<4;mu++){ + for(int nu=0;nu<4;nu++){ + if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho); + else assert(this->StoutSmearing->SmearRho[idx]==0.0); + idx++; + }} + + ////////////////////////////////////////////////////////////////// + // Assemble the N matrix + ////////////////////////////////////////////////////////////////// + // Computes ALL the staples -- could compute one only and do it here + RealD time; + time=-usecond(); + BaseSmear_ghost(Cmu, gU, mu, rho); +#ifdef DEBUG + GaugeLinkField Cmu2(hgrid); + Cmu2.Checkerboard() = cb; + BaseSmear_cb(Cmu2, U, mu, rho); + std::cout << GridLogMessage << " DEBUG: BaseSmear " <oSites(),Nsimd,{ + typedef decltype(coalescedRead(JxAd_v[0])) adj_mat; + adj_mat X, JxAd_site; + RealD kpfac = 1; + + X=1.0; + JxAd_site = X; + for(int k=1;k<12;k++){ + X=-X*ZxAd_v(ss); + kpfac = kpfac /(k+1); + JxAd_site = JxAd_site + X * kpfac; + } + coalescedWrite(JxAd_v[ss],JxAd_site); + }); + } + time+=usecond(); + std::cout << GridLogMessage << "Jx took "<oSites(),Nsimd,{ + typedef decltype(coalescedRead(ZxAd_v[0])) adj_mat; + typedef decltype(coalescedRead(dJdXe_nMpInv_v[0])) adj_vec; + adj_mat X, t2, dt2, t3, dt3, aunit, nMpInv_site; + adj_vec dJdXe_nMpInv_site; + iVector iTas; + iVector dJdX_b; + + for(int b=0;b 1; --j) { + t3 = t2*(1.0 / (j + 1)) + aunit; + t2 = X * t3; + for(int b=0;boSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_nu_x = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto U_mu_xpnu = coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(2,ss); + auto Udag_nu_xpmu = adj(coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(3,ss); + auto Udag_mu_x = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + + auto stencil_ss = (-rho) * U_nu_x * U_mu_xpnu * Udag_nu_xpmu * Udag_mu_x; + + coalescedWrite(gPlaqR_v[ss],stencil_ss); + } + ); + t_ck -= usecond(); + pickCheckerboard(cb,PlaqR,Ghost.Extract(gPlaqR)); + t_ck += usecond(); + } + time+=usecond(); tLR += usecond(); + std::cout << GridLogMessage << "PlaqLR took "<Nsimd()<<" "<oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_nu_y = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_ymmupnu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto Udag_nu_ymmu = adj(coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd)); + auto stencil_ss = (rho) * U_nu_y * Udag_mu_ymmupnu * Udag_nu_ymmu; + coalescedWrite(gPlaqR_v[ss],stencil_ss); + + e = gStencil_v.GetEntry(3,ss); + auto Udag_mu_ymmu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + stencil_ss = Udag_mu_ymmu; + coalescedWrite(gPlaqL_v[ss],stencil_ss); + } + ); + pickCheckerboard((cb+1)%2,PlaqR,Ghost.Extract(gPlaqR)); + pickCheckerboard((cb+1)%2,PlaqL,Ghost.Extract(gPlaqL)); + } + tLR += usecond(); + + tNxy -= usecond(); + Nxy.Checkerboard() = (cb+1)%2; FdetV.Checkerboard() = (cb+1)%2; + t_cshift-= usecond(); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1); + t_cshift += usecond(); + ComputeNxy(PlaqL, PlaqR,Nxy); + Fdet1_nu_oe = transpose(Nxy)*dJdXe_nMpInv_y; + tNxy += usecond(); + + tMJx -= usecond(); + MpInvJx_nu = Cshift(MpInvJx,mu,-1); + Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); + Fdet2_nu_oe = FdetV; + tMJx += usecond(); + + ///////////////// -ve nu ///////////////// + // __ + // | | + // x== // nu polarisation -- clockwise + tLR -= usecond(); + { GRID_TRACE("Staple"); + autoView( gStencil_v , gStencils[mu*(Nd-1)*6+(nu-(mu<=nu))*6+2], AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_mu_y = coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto U_nu_ypmu = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(2,ss); + auto Udag_mu_ypnu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + + auto stencil_ss = (rho) * U_mu_y * U_nu_ypmu * Udag_mu_ypnu; + coalescedWrite(gPlaqL_v[ss],stencil_ss); + } + ); + pickCheckerboard((cb+1)%2,PlaqL,Ghost.Extract(gPlaqL)); + pickCheckerboard((cb+1)%2,PlaqR,Umu[nu]); + } + tLR += usecond(); + + tNxy -= usecond(); + t_cshift-=usecond(); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1); + t_cshift+=usecond(); + ComputeNxy(PlaqL,PlaqR,Nxy); + Fdet1_nu_oe = Fdet1_nu_oe + transpose(Nxy)*dJdXe_nMpInv_y; + tNxy += usecond(); + + tMJx -= usecond(); + t_cshift-=usecond(); + MpInvJx_nu = Cshift(MpInvJx,nu,1); + t_cshift+=usecond(); + Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); + Fdet2_nu_oe = Fdet2_nu_oe+FdetV; + tMJx += usecond(); + + // x== + // | | + // |__| // nu polarisation + tLR -= usecond(); + { + GRID_TRACE("Staple"); + autoView( gStencil_v , gStencils[mu*(Nd-1)*6+(nu-(mu<=nu))*6+3], AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_nu_y = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_ymmupnu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + auto stencil_ss = (-rho) * U_nu_y * Udag_mu_ymmupnu; + coalescedWrite(gPlaqL_v[ss],stencil_ss); + + e = gStencil_v.GetEntry(2,ss); + auto Udag_mu_ymmu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(3,ss); + auto U_nu_ymmu = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + stencil_ss = Udag_mu_ymmu * U_nu_ymmu; + coalescedWrite(gPlaqR_v[ss],stencil_ss); + } + ); + pickCheckerboard(cb,PlaqL,Ghost.Extract(gPlaqL)); + pickCheckerboard(cb,PlaqR,Ghost.Extract(gPlaqR)); + } + tLR += usecond(); + + tNxy -= usecond(); + Nxy.Checkerboard() = cb; FdetV.Checkerboard() = cb; + t_cshift-=usecond(); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1); + t_cshift+=usecond(); + + ComputeNxy(PlaqL,PlaqR,Nxy); + Fdet1_nu_eo = Fdet1_nu_eo + transpose(Nxy)*dJdXe_nMpInv_y; + tNxy += usecond(); + + tMJx -= usecond(); + t_cshift-=usecond(); + MpInvJx_nu = Cshift(MpInvJx,mu,-1); + MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); + t_cshift+=usecond(); + Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); + Fdet2_nu_eo = Fdet2_nu_eo+FdetV; + tMJx += usecond(); + + ///////////////////////////////////////////////////////////////////// + // Set up the determinant force contribution in 3x3 algebra basis + ///////////////////////////////////////////////////////////////////// + t_ins -= usecond(); + setCheckerboard(Fdet1_nu, Fdet1_nu_eo); + setCheckerboard(Fdet1_nu, Fdet1_nu_oe); + InsertForce(Fdet1,Fdet1_nu,nu); + setCheckerboard(Fdet2_nu, Fdet2_nu_eo); + setCheckerboard(Fdet2_nu, Fdet2_nu_oe); + InsertForce(Fdet2,Fdet2_nu,nu); + t_ins+= usecond(); + + ////////////////////////////////////////////////// + // Parallel direction terms + ////////////////////////////////////////////////// + + Nxy.Checkerboard() = (cb+1)%2; FdetV.Checkerboard() = (cb+1)%2; + + // __ + // | " + // |__"x // mu polarisation + tLR -= usecond(); + { + GRID_TRACE("Staple"); + autoView( gStencil_v , gStencils[mu*(Nd-1)*6+(nu-(mu<=nu))*6+4], AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_mu_y = coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto Udag_nu_ypmumnu = adj(coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto Udag_mu_ymnu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + auto stencil_ss = (-rho) * U_mu_y * Udag_nu_ypmumnu * Udag_mu_ymnu; + coalescedWrite(gPlaqL_v[ss],stencil_ss); + + e = gStencil_v.GetEntry(3,ss); + auto Udag_nu_ymnu = adj(coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd)); + stencil_ss = Udag_nu_ymnu; + coalescedWrite(gPlaqR_v[ss],stencil_ss); + } + ); + pickCheckerboard((cb+1)%2,PlaqL,Ghost.Extract(gPlaqL)); + pickCheckerboard((cb+1)%2,PlaqR,Ghost.Extract(gPlaqR)); + } + tLR += usecond(); + + tNxy -= usecond(); + t_cshift-=usecond(); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1); + t_cshift+=usecond(); + + ComputeNxy(PlaqL,PlaqR,Nxy); + Fdet1_mu_oe = Fdet1_mu_oe + transpose(Nxy)*dJdXe_nMpInv_y; + tNxy += usecond(); + + tMJx -= usecond(); + t_cshift-=usecond(); + MpInvJx_nu = Cshift(MpInvJx,nu,-1); + t_cshift+=usecond(); + Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); + Fdet2_mu_oe = Fdet2_mu_oe+FdetV; + tMJx += usecond(); + + // __ + // " | + // x__| // mu polarisation + tLR -= usecond(); + { + GRID_TRACE("Staple"); + autoView( gStencil_v , gStencils[mu*(Nd-1)*6+(nu-(mu<=nu))*6+5], AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto U_mu_y = coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(1,ss); + auto U_nu_ypmu = coalescedReadGeneralPermute(gU_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(2,ss); + auto Udag_mu_ypnu = adj(coalescedReadGeneralPermute(gU_mu_v[e->_offset], e->_permute, Nd)); + + auto stencil_ss = (-rho) * U_mu_y * U_nu_ypmu * Udag_mu_ypnu; + coalescedWrite(gPlaqL_v[ss],stencil_ss); + } + ); + pickCheckerboard((cb+1)%2,PlaqL,Ghost.Extract(gPlaqL)); + pickCheckerboard((cb+1)%2,PlaqR,Umu[nu]); + } + tLR += usecond(); + + tNxy -= usecond(); + t_cshift-=usecond(); + dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1); + t_cshift+=usecond(); + ComputeNxy(PlaqL,PlaqR,Nxy); + Fdet1_mu_oe = Fdet1_mu_oe + transpose(Nxy)*dJdXe_nMpInv_y; + tNxy += usecond(); + + tMJx -= usecond(); + t_cshift-=usecond(); + MpInvJx_nu = Cshift(MpInvJx,nu,1); + t_cshift+=usecond(); + Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); + Fdet2_mu_oe = Fdet2_mu_oe+FdetV; + tMJx += usecond(); + + } + } + RealD t5 = usecond(); + setCheckerboard(Fdet1_mu, Fdet1_mu_oe); + setCheckerboard(Fdet2_mu, Fdet2_mu_oe); + InsertForce(Fdet1,Fdet1_mu,mu); + InsertForce(Fdet2,Fdet2_mu,mu); + + force= (-0.5)*( Fdet1 + Fdet2); + RealD t1 = usecond(); + std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "< gUmu(Nd,grid); + GaugeLinkField PlaqL(hgrid); + GaugeLinkField Z(hgrid); + GaugeLinkField Umu(hgrid), Cmu(hgrid); + LatticeComplex ln_det(hgrid); + typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField; + AdjMatrixField Ncb(hgrid); + AdjMatrixField Zac(hgrid); + ColourMatrix Ident; + + RealD time=0, tN=0, tZ=0, t_M=0, tJ_lnDet=0, t_det=0, t_ln=0; + int mu= (smr/2) %Nd; + + time -= usecond(); + Ident = ComplexD(1.0); + + int cb = cbs[smr]; //Assume: the applied mask is the cb mask - if ( mu == mmu ) - temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; + Z.Checkerboard() = cb; + Cmu.Checkerboard() = cb; + PlaqL.Checkerboard() = cb; + Ncb.Checkerboard() = cb; + ln_det.Checkerboard() = cb; - if ( nu == mmu ) { - temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; + pickCheckerboard(cb,Umu,peekLorentz(U,mu)); - u_tmp = adj(U_nu)*iLambda_nu; - sh_field = Cshift(u_tmp, mu, 1); - temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; - } - - sh_field = Cshift(temp_Sigma, nu, -1); - Gimpl::AddLink(SigmaTerm, sh_field, mu); + ////////////////////////////////////////////////////////////////// + // Assemble the N matrix + ////////////////////////////////////////////////////////////////// - } + tN -= usecond(); + {GRID_TRACE("ExchangePeriodic"); + gU = Ghost.ExchangePeriodic(U); } - } - - void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) { - GridBase *grid = U.Grid(); - GaugeLinkField tmp_stpl(grid); - WilsonLoops WL; - Cup = Zero(); - for(int nu=0; nuStoutSmearing->SmearRho[1]; + PlaqL = Ident; + BaseSmear_ghost(Cmu,gU,mu,rho); + ComputeNxy(PlaqL,Umu * adj(Cmu),Ncb);//we can expand function body and form a bigger accelerated loop + tN += usecond(); + + ////////////////////////////////////////////////////////////////// + // Assemble Luscher exp diff map J matrix + ////////////////////////////////////////////////////////////////// + tZ -= usecond(); + {GRID_TRACE("Z"); + // Ta so Z lives in Lie algabra + Z = Ta(Cmu * adj(Umu)); + // Move Z to the Adjoint Rep == make_adjoint_representation + SU3Adjoint::make_adjoint_rep(Zac, Z); } - } - // Adjoint vector to GaugeField force - void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu) - { - Complex ci(0,1); - GaugeLinkField Fdet_pol(Fdet.Grid()); - Fdet_pol=Zero(); - for(int e=0;e<8;e++){ - ColourMatrix te; - SU3::generator(e, te); - auto tmp=peekColour(Fdet_nu,e); - Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why? + tZ += usecond(); + + tJ_lnDet -= usecond(); + t_M -= usecond(); + {GRID_TRACE("Mab"); + autoView(ln_det_v,ln_det,AcceleratorWrite); + autoView(Zac_v,Zac,AcceleratorRead); + autoView(Ncb_v,Ncb,AcceleratorRead); + accelerator_for(ss,hgrid->oSites(),hgrid->Nsimd(),{ + typedef decltype(coalescedRead(Zac_v(0))) adj_mat; + + adj_mat X, Jac, Mab_ss; + RealD kpfac = 1; + + ////////////////////////////////////// + // J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)! + ////////////////////////////////////// + X=1.0; + Jac = X; + for(int k=1;k<12;k++){ + X=(-1.0)*X*Zac_v(ss); + kpfac = kpfac /((RealD) (k+1)); + Jac = Jac + X * kpfac; + } + + //////////////////////////// + // Mab + //////////////////////////// + Mab_ss = Complex(1.0,0.0); + Mab_ss = Mab_ss - Jac * Ncb_v(ss); + + + //////////////////////////// + // ln det + //////////////////////////// + auto detD = Determinant(Mab_ss); + coalescedWrite(ln_det_v[ss],log(detD)); + }); } - pokeLorentz(Fdet, Fdet_pol, nu); + t_M+=usecond(); + + tJ_lnDet += usecond(); + Complex result = sum(ln_det); + time += usecond(); + std::cout << GridLogMessage << " logDetJacobianLevel " << time/1e3 <<" ms N "< SU3::generator(a, ta); ta = 2.0 * ci * ta; // Qlat Tb = 2i Tb^Grid - UtaU= adj(PlaqL)*ta*PlaqR; // 6ms + { + GRID_TRACE("UtaU"); + UtaU= adj(PlaqL)*ta*PlaqR; // 6ms + } tta+=usecond(); //////////////////////////////////////////// // Could add this entire C-loop to a projection routine @@ -178,9 +1108,19 @@ class SmearedConfigurationMasked : public SmearedConfiguration SU3::generator(c, tc); tc = 2.0*ci*tc; tp-=usecond(); - D = Ta( tc *UtaU); // 2ms + { + GRID_TRACE("D = Ta tc*UtaU"); + D = Ta( tc *UtaU); // 2ms + } #if 1 - SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms + { + GRID_TRACE("LieAlgebraProject"); + time=-usecond(); + SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms + time+=usecond(); + tpl+=time; + //std::cout << GridLogMessage << " LieAlgebraProject_in_Compute_MpInvJx " << a << " " << c << " "<