diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h index 589ea36f83..f151bc0d54 100644 --- a/Grid/allocator/Allocator.h +++ b/Grid/allocator/Allocator.h @@ -2,3 +2,4 @@ #include #include #include +#include diff --git a/Grid/allocator/DeviceMemoryAllocator.cc b/Grid/allocator/DeviceMemoryAllocator.cc new file mode 100644 index 0000000000..c6711deba2 --- /dev/null +++ b/Grid/allocator/DeviceMemoryAllocator.cc @@ -0,0 +1,235 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/DeviceMemoryAllocator.h + + Copyright (C) 2025 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +NAMESPACE_BEGIN(Grid); + +#define DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE (64*1024) +#define OVERALLOCATION_FACTOR 1.2 + +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +struct DeviceMemoryAllocator { + + bool initialized; + char* base; + size_t size; + size_t offset; + bool verbose; + + DeviceMemoryAllocator() { + initialized = false; + base = 0; + size = 0; + offset = 0; + verbose = false; + } + + ~DeviceMemoryAllocator() { + if (initialized) { + acceleratorFreeDevice(base); + initialized = false; + } + } + + std::vector pages; + std::map > size_map; + + void Init(size_t _size) { + assert(!initialized); + + char* str; + if ((str = getenv("GRID_OVERALLOCATION_FACTOR"))) { + _size = (size_t)(_size * atof(str)); + } else { + _size = (size_t)(_size * OVERALLOCATION_FACTOR); + } + + verbose = (getenv("GRID_DEBUG_DEVICE_ALLOCATOR") != 0); + + size_t n_pages = (_size + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + size = n_pages * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + std::cout << GridLogMessage << "Init device allocator with " << size << " bytes" << std::endl; + + base = (char*)acceleratorAllocDeviceInternal(size); + assert(base); + + if (verbose) + std::cout << GridLogMessage << "Initialize memory to zero" << std::endl; + + { + uint64_t* ba = (uint64_t*)base; + size_t n = size / sizeof(uint64_t); + size_t MAX_BLOCK_INIT = 128*1024*1024; + while (n > 0) { + size_t n0 = n; + if (n0 > MAX_BLOCK_INIT) + n0 = MAX_BLOCK_INIT; + accelerator_for(i, n0, 1, { + ba[i] = (uint64_t)-1; + }); + ba += n0; + n -= n0; + } + } + + if (verbose) + std::cout << GridLogMessage << "Done" << std::endl; + + offset = 0; + + pages.resize(n_pages, 0); + + if (verbose) + std::cout << GridLogMessage << "Pages initialized" << std::endl; + + initialized = true; + } + + void* attemptReuseExactSize(size_t n_pages) { + auto sm = size_map.find(n_pages); + if (sm != size_map.end() && sm->second.size() > 0) { + size_t index = sm->second.back(); + sm->second.pop_back(); + + if (sm->second.size() == 0) + size_map.erase(sm); + + assert(pages[index] == 0); + pages[index] = n_pages; + + return base + index * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + } + return 0; + } + + void* attemptAllocUnused(size_t n_pages) { + size_t end = (offset + n_pages) * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + void* ptr = 0; + + if (end <= size) { + pages[offset] = n_pages; + + ptr = base + offset * DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + offset += n_pages; + + if (verbose) { + size_t reusable_pages = 0; + for (auto & sm : size_map) + reusable_pages += sm.first * sm.second.size(); + + std::cout << GridLogMessage << (size - end) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE << " pages left to allocate (" + << (size - end) * 100 / size << "% unallocated, " << reusable_pages << " reusable pages)" << std::endl; + } + } + + return ptr; + } + + void* alloc(size_t bytes) { + if (!initialized) + Init(MemoryManager::DeviceMaxBytes); + + if (!bytes) + bytes++; + + size_t n_pages = (bytes + DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE - 1) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + + // first check if block of perfect size is available + void* ptr; + if ((ptr = attemptReuseExactSize(n_pages))) { + + if (verbose) + std::cout << GridLogMessage << "Can re-use perfect pointer for " << n_pages << " pages" << std::endl; + + return ptr; + } + + // if not, attempt to allocate in the unused area + if ((ptr = attemptAllocUnused(n_pages))) + return ptr; + + // last attempt, find a re-usable region that barely fits and return it + // for loop of std::map iterates in ascending order + size_t reusable_pages = 0; + size_t n_pages_usable = 0; + for (auto & sm : size_map) { + assert(sm.second.size() > 0); // should never be empty + reusable_pages += sm.first * sm.second.size(); + if (n_pages_usable == 0 && sm.first > n_pages) + n_pages_usable = sm.first; + } + + if (n_pages_usable == 0) { + std::cout << GridLogMessage << "Out of memory for " << n_pages << " pages! Re-usable pages at time of death:" << std::endl; + + for (auto & sm : size_map) { + std::cout << GridLogMessage << sm.second.size() << " x " << sm.first << " pages" << std::endl; + } + + exit(1); + } + + if ((ptr = attemptReuseExactSize(n_pages_usable))) { + + if (verbose) + std::cout << GridLogMessage << "Can re-use pointer for " << n_pages_usable << " pages when " << n_pages << " were needed; " << reusable_pages << " reusable pages" << std::endl; + + return ptr; + } + + // this should never be reached + assert(0); + return ptr; + } + + void free(void* ptr) { + if (!initialized) + return; + + size_t index = ((size_t)((char*)ptr - base)) / DEVICE_MEMORY_ALLOCATOR_PAGE_SIZE; + size_t n_pages = pages[index]; + //std::cout << GridLogMessage << "Freeing ptr " << ptr << " has " << n_pages << " pages" << std::endl; + pages[index] = 0; + auto & sm = size_map[n_pages]; + sm.push_back(index); + } +}; + +static DeviceMemoryAllocator dma; + +void *acceleratorAllocDevice(size_t bytes) { + return dma.alloc(bytes); +} + +void acceleratorFreeDevice(void *ptr) { + dma.free(ptr); +} +#endif + +NAMESPACE_END(Grid); diff --git a/Grid/allocator/DeviceMemoryAllocator.h b/Grid/allocator/DeviceMemoryAllocator.h new file mode 100644 index 0000000000..13c7eb8e18 --- /dev/null +++ b/Grid/allocator/DeviceMemoryAllocator.h @@ -0,0 +1,36 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/DeviceMemoryAllocator.h + + Copyright (C) 2025 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +void *acceleratorAllocDevice(size_t bytes); +void acceleratorFreeDevice(void *ptr); +#endif + +NAMESPACE_END(Grid); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 38e8072d4e..bb168a66a4 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -44,6 +44,11 @@ Author: paboyle inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); } #endif +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +#define acceleratorAllocDevice acceleratorAllocDeviceInternal +#define acceleratorFreeDevice acceleratorFreeDeviceInternal +#endif + NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////// @@ -346,7 +351,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait_and_throw(); theGridAccelerator->wait_and_throw(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);}; @@ -355,7 +360,7 @@ inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait_and_throw(); theCopyAccelerator->wait_and_throw(); } /////// @@ -365,7 +370,7 @@ typedef sycl::event acceleratorEvent_t; inline void acceleratorEventWait(acceleratorEvent_t ev) { - ev.wait(); + ev.wait_and_throw(); } inline int acceleratorEventIsComplete(acceleratorEvent_t ev) @@ -377,9 +382,9 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait_and_throw();theCopyAccelerator->wait_and_throw();} inline int acceleratorIsCommunicable(void *ptr) { @@ -650,7 +655,7 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);}; ////////////////////////////////////////////// #ifdef GRID_SYCL -inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); }; +inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); theGridAccelerator->ext_oneapi_submit_barrier(); }; #else // Ordering within a stream guaranteed on Nvidia & AMD inline void acceleratorFenceComputeStream(void){ }; @@ -720,3 +725,8 @@ template T acceleratorGet(T& dev) NAMESPACE_END(Grid); + +#ifdef GRID_DEVICE_MEMORY_ALLOCATOR +#undef acceleratorAllocDevice +#undef acceleratorFreeDevice +#endif diff --git a/benchmarks/Benchmark_dwf_fp32_jureap.cc b/benchmarks/Benchmark_dwf_fp32_jureap.cc new file mode 100644 index 0000000000..92dd40bc1b --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32_jureap.cc @@ -0,0 +1,431 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +//////////////////////// +/// Move to domains //// +//////////////////////// + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT +}; + +void Benchmark(int Ls, Coordinate Dirichlet); + +#include +#include + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + int Ls=16; + for(int i=0;i> Ls; + } + } + + ////////////////// + // With comms + ////////////////// + Coordinate Dirichlet(Nd+1,0); + + std::cout << "\n\n\n\n\n\n" < seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); +#define SINGLE +#ifdef SINGLE + typedef vComplexF Simd; + typedef LatticeFermionF FermionField; + typedef LatticeGaugeFieldF GaugeField; + typedef LatticeColourMatrixF ColourMatrixField; + typedef DomainWallFermionF FermionAction; +#endif +#ifdef DOUBLE + typedef vComplexD Simd; + typedef LatticeFermionD FermionField; + typedef LatticeGaugeFieldD GaugeField; + typedef LatticeColourMatrixD ColourMatrixField; + typedef DomainWallFermionD FermionAction; +#endif +#ifdef DOUBLE2 + typedef vComplexD2 Simd; + typedef LatticeFermionD2 FermionField; + typedef LatticeGaugeFieldD2 GaugeField; + typedef LatticeColourMatrixD2 ColourMatrixField; + typedef DomainWallFermionD2 FermionAction; +#endif + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + + + FermionField src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); + UmuCopy=Umu; + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; + + //////////////////////////////////// + // Apply BCs + //////////////////////////////////// + Coordinate Block(4); + for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; + + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl; + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl; + + DirichletFilter Filter(Block); + Filter.applyFilter(Umu); + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + std::this_thread::sleep_for(std::chrono::seconds(5)); + auto cpu_start = std::chrono::system_clock::now(); + double t0=usecond(); + for(int i=0;iBarrier(); + auto cpu_stop = std::chrono::system_clock::now(); + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (FGrid->ThisRank() == 0) { + std::ofstream file("energy.times"); + file<<"energy_start:"<< std::chrono::duration_cast(cpu_start.time_since_epoch()).count()<(cpu_stop.time_since_epoch()).count()<1.0e-4) ) { + std::cout<Barrier(); + std::cout<Barrier(); + exit(-1); + } + assert (n2e< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;sBarrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu